[PATCH net-next 08/11] rocker: Flip to the new dev walk API

2016-10-14 Thread David Ahern
Convert rocker to the new dev walk API. This is just a code conversion;
no functional change is intended.

Signed-off-by: David Ahern 
---
 drivers/net/ethernet/rocker/rocker_main.c | 31 ---
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker_main.c 
b/drivers/net/ethernet/rocker/rocker_main.c
index 5424fb341613..9310adc0bcbb 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2839,20 +2839,37 @@ static bool rocker_port_dev_check_under(const struct 
net_device *dev,
return true;
 }
 
+struct rocker_walk_data {
+   struct rocker *rocker;
+   struct rocker_port *port;
+};
+
+static int rocker_lower_dev_walk(struct net_device *lower_dev, void *_data)
+{
+   struct rocker_walk_data *data = (struct rocker_walk_data *)_data;
+   int ret = 0;
+
+   if (rocker_port_dev_check_under(lower_dev, data->rocker)) {
+   data->port = netdev_priv(lower_dev);
+   ret = 1;
+   }
+
+   return ret;
+}
+
 struct rocker_port *rocker_port_dev_lower_find(struct net_device *dev,
   struct rocker *rocker)
 {
-   struct net_device *lower_dev;
-   struct list_head *iter;
+   struct rocker_walk_data data;
 
if (rocker_port_dev_check_under(dev, rocker))
return netdev_priv(dev);
 
-   netdev_for_each_all_lower_dev(dev, lower_dev, iter) {
-   if (rocker_port_dev_check_under(lower_dev, rocker))
-   return netdev_priv(lower_dev);
-   }
-   return NULL;
+   data.rocker = rocker;
+   data.port = NULL;
+   netdev_walk_all_lower_dev(dev, rocker_lower_dev_walk, &data);
+
+   return data.port;
 }
 
 static int rocker_netdevice_event(struct notifier_block *unused,
-- 
2.1.4



[PATCH net-next 10/11] net: Add warning if any lower device is still in adjacency list

2016-10-14 Thread David Ahern
Lower list should be empty just like upper.

Signed-off-by: David Ahern 
---
 net/core/dev.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index a012c7266230..99a1cb432945 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5225,6 +5225,20 @@ struct net_device *netdev_master_upper_dev_get(struct 
net_device *dev)
 }
 EXPORT_SYMBOL(netdev_master_upper_dev_get);
 
+/**
+ * netdev_has_any_lower_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to a lower device and return true in case
+ * it is. The caller must hold the RTNL lock.
+ */
+static bool netdev_has_any_lower_dev(struct net_device *dev)
+{
+   ASSERT_RTNL();
+
+   return !list_empty(&dev->adj_list.lower);
+}
+
 void *netdev_adjacent_get_private(struct list_head *adj_list)
 {
struct netdev_adjacent *adj;
@@ -6622,6 +6636,7 @@ static void rollback_registered_many(struct list_head 
*head)
 
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
+   WARN_ON(netdev_has_any_lower_dev(dev));
 
/* Remove entries from kobject tree */
netdev_unregister_kobject(dev);
-- 
2.1.4



[PATCH net-next 09/11] net: Remove all_adj_list and its references

2016-10-14 Thread David Ahern
Only direct adjacencies are maintained. All upper or lower devices can
be learned via the new walk API which recursively walks the adj_list for
upper devices or lower devices.

Signed-off-by: David Ahern 
---
 include/linux/netdevice.h |  25 -
 net/core/dev.c| 229 +-
 2 files changed, 21 insertions(+), 233 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a5902d995907..458c87631e7f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1456,7 +1456,6 @@ enum netdev_priv_flags {
  * @ptype_specific: Device-specific, protocol-specific packet handlers
  *
  * @adj_list:  Directly linked devices, like slaves for bonding
- * @all_adj_list:  All linked devices, *including* neighbours
  * @features:  Currently active device features
  * @hw_features:   User-changeable features
  *
@@ -1675,11 +1674,6 @@ struct net_device {
struct list_head lower;
} adj_list;
 
-   struct {
-   struct list_head upper;
-   struct list_head lower;
-   } all_adj_list;
-
netdev_features_t   features;
netdev_features_t   hw_features;
netdev_features_t   wanted_features;
@@ -3771,13 +3765,6 @@ struct net_device 
*netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
 updev; \
 updev = netdev_upper_get_next_dev_rcu(dev, &(iter)))
 
-/* iterate through upper list, must be called under RCU read lock */
-#define netdev_for_each_all_upper_dev_rcu(dev, updev, iter) \
-   for (iter = &(dev)->all_adj_list.upper, \
-updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)); \
-updev; \
-updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)))
-
 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
  int (*fn)(struct net_device *upper_dev,
void *data),
@@ -3817,18 +3804,6 @@ struct net_device *netdev_all_lower_get_next(struct 
net_device *dev,
 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
 struct list_head **iter);
 
-#define netdev_for_each_all_lower_dev(dev, ldev, iter) \
-   for (iter = (dev)->all_adj_list.lower.next, \
-ldev = netdev_all_lower_get_next(dev, &(iter)); \
-ldev; \
-ldev = netdev_all_lower_get_next(dev, &(iter)))
-
-#define netdev_for_each_all_lower_dev_rcu(dev, ldev, iter) \
-   for (iter = (dev)->all_adj_list.lower.next, \
-ldev = netdev_all_lower_get_next_rcu(dev, &(iter)); \
-ldev; \
-ldev = netdev_all_lower_get_next_rcu(dev, &(iter)))
-
 int netdev_walk_all_lower_dev(struct net_device *dev,
  int (*fn)(struct net_device *lower_dev,
void *data),
diff --git a/net/core/dev.c b/net/core/dev.c
index 1780f94ed25f..a012c7266230 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5137,6 +5137,16 @@ static struct netdev_adjacent *__netdev_find_adj(struct 
net_device *adj_dev,
return NULL;
 }
 
+static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+{
+   struct net_device *dev = (struct net_device *)data;
+
+   if (upper_dev == dev)
+   return 1;
+
+   return 0;
+}
+
 /**
  * netdev_has_upper_dev - Check if device is linked to an upper device
  * @dev: device
@@ -5151,7 +5161,8 @@ bool netdev_has_upper_dev(struct net_device *dev,
 {
ASSERT_RTNL();
 
-   return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
+   return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+upper_dev);
 }
 EXPORT_SYMBOL(netdev_has_upper_dev);
 
@@ -5165,16 +5176,6 @@ EXPORT_SYMBOL(netdev_has_upper_dev);
  * The caller must hold rcu lock.
  */
 
-static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
-{
-   struct net_device *dev = (struct net_device *)data;
-
-   if (upper_dev == dev)
-   return 1;
-
-   return 0;
-}
-
 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
  struct net_device *upper_dev)
 {
@@ -5197,7 +5198,7 @@ static bool netdev_has_any_upper_dev(struct net_device 
*dev)
 {
ASSERT_RTNL();
 
-   return !list_empty(&dev->all_adj_list.upper);
+   return !list_empty(&dev->adj_list.upper);
 }
 
 /**
@@ -5260,32 +5261,6 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct 
net_device *dev,
 }
 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 
-/**
- * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next device from the dev's upper list, starting from iter
- * position. The caller must hold RCU read lock.
- 

[PATCH net-next 07/11] mlxsw: Flip to the new dev walk API

2016-10-14 Thread David Ahern
Convert mlxsw users to new dev walk API. This is just a code conversion;
no functional change is intended.

Signed-off-by: David Ahern 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 37 --
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 43a5eddc2c11..99805fd3d110 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3092,19 +3092,30 @@ static bool mlxsw_sp_port_dev_check(const struct 
net_device *dev)
return dev->netdev_ops == &mlxsw_sp_port_netdev_ops;
 }
 
+static int mlxsw_lower_dev_walk(struct net_device *lower_dev, void *data)
+{
+   struct mlxsw_sp_port **port = data;
+   int ret = 0;
+
+   if (mlxsw_sp_port_dev_check(lower_dev)) {
+   *port = netdev_priv(lower_dev);
+   ret = 1;
+   }
+
+   return ret;
+}
+
 static struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find(struct net_device 
*dev)
 {
-   struct net_device *lower_dev;
-   struct list_head *iter;
+   struct mlxsw_sp_port *port;
 
if (mlxsw_sp_port_dev_check(dev))
return netdev_priv(dev);
 
-   netdev_for_each_all_lower_dev(dev, lower_dev, iter) {
-   if (mlxsw_sp_port_dev_check(lower_dev))
-   return netdev_priv(lower_dev);
-   }
-   return NULL;
+   port = NULL;
+   netdev_walk_all_lower_dev(dev, mlxsw_lower_dev_walk, &port);
+
+   return port;
 }
 
 static struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev)
@@ -3117,17 +3128,15 @@ static struct mlxsw_sp *mlxsw_sp_lower_get(struct 
net_device *dev)
 
 static struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find_rcu(struct 
net_device *dev)
 {
-   struct net_device *lower_dev;
-   struct list_head *iter;
+   struct mlxsw_sp_port *port;
 
if (mlxsw_sp_port_dev_check(dev))
return netdev_priv(dev);
 
-   netdev_for_each_all_lower_dev_rcu(dev, lower_dev, iter) {
-   if (mlxsw_sp_port_dev_check(lower_dev))
-   return netdev_priv(lower_dev);
-   }
-   return NULL;
+   port = NULL;
+   netdev_walk_all_lower_dev_rcu(dev, mlxsw_lower_dev_walk, &port);
+
+   return port;
 }
 
 struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev)
-- 
2.1.4



[PATCH net-next 06/11] ixgbe: Flip to the new dev walk API

2016-10-14 Thread David Ahern
Convert ixgbe users to new dev walk API. This is just a code conversion;
no functional change is intended.

Signed-off-by: David Ahern 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 132 --
 1 file changed, 82 insertions(+), 50 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 784b0b98ab2f..f380fda11eb6 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5012,24 +5012,23 @@ static int ixgbe_fwd_ring_up(struct net_device *vdev,
return err;
 }
 
-static void ixgbe_configure_dfwd(struct ixgbe_adapter *adapter)
+static int ixgbe_upper_dev_walk(struct net_device *upper, void *data)
 {
-   struct net_device *upper;
-   struct list_head *iter;
-   int err;
-
-   netdev_for_each_all_upper_dev_rcu(adapter->netdev, upper, iter) {
-   if (netif_is_macvlan(upper)) {
-   struct macvlan_dev *dfwd = netdev_priv(upper);
-   struct ixgbe_fwd_adapter *vadapter = dfwd->fwd_priv;
+   if (netif_is_macvlan(upper)) {
+   struct macvlan_dev *dfwd = netdev_priv(upper);
+   struct ixgbe_fwd_adapter *vadapter = dfwd->fwd_priv;
 
-   if (dfwd->fwd_priv) {
-   err = ixgbe_fwd_ring_up(upper, vadapter);
-   if (err)
-   continue;
-   }
-   }
+   if (dfwd->fwd_priv)
+   ixgbe_fwd_ring_up(upper, vadapter);
}
+
+   return 0;
+}
+
+static void ixgbe_configure_dfwd(struct ixgbe_adapter *adapter)
+{
+   netdev_walk_all_upper_dev_rcu(adapter->netdev,
+ ixgbe_upper_dev_walk, NULL);
 }
 
 static void ixgbe_configure(struct ixgbe_adapter *adapter)
@@ -5448,12 +5447,25 @@ static void ixgbe_fdir_filter_exit(struct ixgbe_adapter 
*adapter)
spin_unlock(&adapter->fdir_perfect_lock);
 }
 
+static int ixgbe_disable_macvlan(struct net_device *upper, void *data)
+{
+   if (netif_is_macvlan(upper)) {
+   struct macvlan_dev *vlan = netdev_priv(upper);
+
+   if (vlan->fwd_priv) {
+   netif_tx_stop_all_queues(upper);
+   netif_carrier_off(upper);
+   netif_tx_disable(upper);
+   }
+   }
+
+   return 0;
+}
+
 void ixgbe_down(struct ixgbe_adapter *adapter)
 {
struct net_device *netdev = adapter->netdev;
struct ixgbe_hw *hw = &adapter->hw;
-   struct net_device *upper;
-   struct list_head *iter;
int i;
 
/* signal that we are down to the interrupt handler */
@@ -5477,17 +5489,8 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
netif_tx_disable(netdev);
 
/* disable any upper devices */
-   netdev_for_each_all_upper_dev_rcu(adapter->netdev, upper, iter) {
-   if (netif_is_macvlan(upper)) {
-   struct macvlan_dev *vlan = netdev_priv(upper);
-
-   if (vlan->fwd_priv) {
-   netif_tx_stop_all_queues(upper);
-   netif_carrier_off(upper);
-   netif_tx_disable(upper);
-   }
-   }
-   }
+   netdev_walk_all_upper_dev_rcu(adapter->netdev,
+ ixgbe_disable_macvlan, NULL);
 
ixgbe_irq_disable(adapter);
 
@@ -6728,6 +6731,18 @@ static void ixgbe_update_default_up(struct ixgbe_adapter 
*adapter)
 #endif
 }
 
+static int ixgbe_enable_macvlan(struct net_device *upper, void *data)
+{
+   if (netif_is_macvlan(upper)) {
+   struct macvlan_dev *vlan = netdev_priv(upper);
+
+   if (vlan->fwd_priv)
+   netif_tx_wake_all_queues(upper);
+   }
+
+   return 0;
+}
+
 /**
  * ixgbe_watchdog_link_is_up - update netif_carrier status and
  * print link up message
@@ -6737,8 +6752,6 @@ static void ixgbe_watchdog_link_is_up(struct 
ixgbe_adapter *adapter)
 {
struct net_device *netdev = adapter->netdev;
struct ixgbe_hw *hw = &adapter->hw;
-   struct net_device *upper;
-   struct list_head *iter;
u32 link_speed = adapter->link_speed;
const char *speed_str;
bool flow_rx, flow_tx;
@@ -6809,14 +6822,8 @@ static void ixgbe_watchdog_link_is_up(struct 
ixgbe_adapter *adapter)
 
/* enable any upper devices */
rtnl_lock();
-   netdev_for_each_all_upper_dev_rcu(adapter->netdev, upper, iter) {
-   if (netif_is_macvlan(upper)) {
-   struct macvlan_dev *vlan = netdev_priv(upper);
-
-   if (vlan->fwd_priv)
-   netif_tx_wake_all_queues(upper);
-   }
-   }
+   

[PATCH net-next 11/11] net: dev: Improve debug statements for adjacency tracking

2016-10-14 Thread David Ahern
Adjacency code only has debugs for the insert case. Add debugs for
the remove path and make both consistently worded to make it easier
to follow the insert and removal with reference counts.

In addition, change the BUG to a WARN_ON. A missing adjacency at
removal time is not cause for a panic.

Signed-off-by: David Ahern 
---
 net/core/dev.c | 22 +++---
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 99a1cb432945..10fd42a833e6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5567,6 +5567,9 @@ static int __netdev_adjacent_dev_insert(struct net_device 
*dev,
 
if (adj) {
adj->ref_nr += 1;
+   pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
+dev->name, adj_dev->name, adj->ref_nr);
+
return 0;
}
 
@@ -5580,8 +5583,8 @@ static int __netdev_adjacent_dev_insert(struct net_device 
*dev,
adj->private = private;
dev_hold(adj_dev);
 
-   pr_debug("dev_hold for %s, because of link added from %s to %s\n",
-adj_dev->name, dev->name, adj_dev->name);
+   pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold 
on %s\n",
+dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
@@ -5620,17 +5623,22 @@ static void __netdev_adjacent_dev_remove(struct 
net_device *dev,
 {
struct netdev_adjacent *adj;
 
+   pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
+dev->name, adj_dev->name, ref_nr);
+
adj = __netdev_find_adj(adj_dev, dev_list);
 
if (!adj) {
-   pr_err("tried to remove device %s from %s\n",
+   pr_err("Adjacency does not exist for device %s from %s\n",
   dev->name, adj_dev->name);
-   BUG();
+   WARN_ON(1);
+   return;
}
 
if (adj->ref_nr > ref_nr) {
-   pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
-ref_nr, adj->ref_nr-ref_nr);
+   pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
+dev->name, adj_dev->name, ref_nr,
+adj->ref_nr - ref_nr);
adj->ref_nr -= ref_nr;
return;
}
@@ -5642,7 +5650,7 @@ static void __netdev_adjacent_dev_remove(struct 
net_device *dev,
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 
list_del_rcu(&adj->list);
-   pr_debug("dev_put for %s, because link removed from %s to %s\n",
+   pr_debug("adjacency: dev_put for %s, because link removed from %s to 
%s\n",
 adj_dev->name, dev->name, adj_dev->name);
dev_put(adj_dev);
kfree_rcu(adj, rcu);
-- 
2.1.4



[PATCH net-next 02/11] net: Introduce new api for walking upper and lower devices

2016-10-14 Thread David Ahern
This patch introduces netdev_walk_all_upper_dev_rcu,
netdev_walk_all_lower_dev and netdev_walk_all_lower_dev_rcu. These
functions recursively walk the adj_list of devices to determine all upper
and lower devices.

The functions take a callback function that is invoked for each device
in the list. If the callback returns non-0, the walk is terminated and
the functions return that code back to callers.

v2
- fixed definition of netdev_next_lower_dev_rcu to mirror the upper_dev
  version.

Signed-off-by: David Ahern 
---
 include/linux/netdevice.h |  17 +
 net/core/dev.c| 161 ++
 2 files changed, 178 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bf341b65ca5e..a5902d995907 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3778,6 +3778,14 @@ struct net_device 
*netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
 updev; \
 updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)))
 
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *upper_dev,
+   void *data),
+ void *data);
+
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+ struct net_device *upper_dev);
+
 void *netdev_lower_get_next_private(struct net_device *dev,
struct list_head **iter);
 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
@@ -3821,6 +3829,15 @@ struct net_device *netdev_all_lower_get_next_rcu(struct 
net_device *dev,
 ldev; \
 ldev = netdev_all_lower_get_next_rcu(dev, &(iter)))
 
+int netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *lower_dev,
+   void *data),
+ void *data);
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *lower_dev,
+   void *data),
+ void *data);
+
 void *netdev_adjacent_get_private(struct list_head *adj_list);
 void *netdev_lower_get_first_private_rcu(struct net_device *dev);
 struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 5399af8fdac0..1780f94ed25f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5156,6 +5156,37 @@ bool netdev_has_upper_dev(struct net_device *dev,
 EXPORT_SYMBOL(netdev_has_upper_dev);
 
 /**
+ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks the entire upper device chain.
+ * The caller must hold rcu lock.
+ */
+
+static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+{
+   struct net_device *dev = (struct net_device *)data;
+
+   if (upper_dev == dev)
+   return 1;
+
+   return 0;
+}
+
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+   if (netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev))
+   return true;
+
+   return false;
+}
+EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
+
+/**
  * netdev_has_any_upper_dev - Check if device is linked to some device
  * @dev: device
  *
@@ -5255,6 +5286,51 @@ struct net_device 
*netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
 
+static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
+   struct list_head **iter)
+{
+   struct netdev_adjacent *upper;
+
+   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+   upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+   if (&upper->list == &dev->adj_list.upper)
+   return NULL;
+
+   *iter = &upper->list;
+
+   return upper->dev;
+}
+
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+   void *data),
+ void *data)
+{
+   struct net_device *udev;
+   struct list_head *iter;
+   int ret;
+
+   for (iter = &dev->adj_list.upper,
+udev = netdev_next_upper_dev_rcu(dev, &iter);
+udev;
+udev = netdev_next_upper_dev_rcu(dev, &iter)) {
+   /* first is the upper device itself */
+   ret = fn(udev, data);
+   if (ret)
+   return ret;
+
+

[PATCH net-next 05/11] IB/ipoib: Flip to new dev walk API

2016-10-14 Thread David Ahern
Convert ipoib_get_net_dev_match_addr to the new upper device walk API.
This is just a code conversion; no functional change is intended.

Signed-off-by: David Ahern 
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 37 +--
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 5636fc3da6b8..624855ab7205 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -292,6 +292,25 @@ static struct net_device *ipoib_get_master_net_dev(struct 
net_device *dev)
return dev;
 }
 
+struct ipoib_walk_data {
+   const struct sockaddr *addr;
+   struct net_device *result;
+};
+
+static int ipoib_upper_walk(struct net_device *upper, void *_data)
+{
+   struct ipoib_walk_data *data = (struct ipoib_walk_data *)_data;
+   int ret = 0;
+
+   if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) {
+   dev_hold(upper);
+   data->result = upper;
+   ret = 1;
+   }
+
+   return ret;
+}
+
 /**
  * Find a net_device matching the given address, which is an upper device of
  * the given net_device.
@@ -304,27 +323,21 @@ static struct net_device *ipoib_get_master_net_dev(struct 
net_device *dev)
 static struct net_device *ipoib_get_net_dev_match_addr(
const struct sockaddr *addr, struct net_device *dev)
 {
-   struct net_device *upper,
- *result = NULL;
-   struct list_head *iter;
+   struct ipoib_walk_data data = {
+   .addr = addr,
+   };
 
rcu_read_lock();
if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
dev_hold(dev);
-   result = dev;
+   data.result = dev;
goto out;
}
 
-   netdev_for_each_all_upper_dev_rcu(dev, upper, iter) {
-   if (ipoib_is_dev_match_addr_rcu(addr, upper)) {
-   dev_hold(upper);
-   result = upper;
-   break;
-   }
-   }
+   netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data);
 out:
rcu_read_unlock();
-   return result;
+   return data.result;
 }
 
 /* returns the number of IPoIB netdevs on top a given ipoib device matching a
-- 
2.1.4



[PATCH net-next 03/11] net: bonding: Flip to the new dev walk API

2016-10-14 Thread David Ahern
Convert alb_send_learning_packets and bond_has_this_ip to use the new
netdev_walk_all_upper_dev_rcu API. In both cases this is just a code
conversion; no functional change is intended.

Signed-off-by: David Ahern 
---
 drivers/net/bonding/bond_alb.c  | 82 ++---
 drivers/net/bonding/bond_main.c | 21 +++
 2 files changed, 65 insertions(+), 38 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 551f0f8dead3..1ddedec61900 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -950,13 +950,61 @@ static void alb_send_lp_vid(struct slave *slave, u8 
mac_addr[],
dev_queue_xmit(skb);
 }
 
+struct alb_walk_data {
+   struct bonding *bond;
+   struct slave *slave;
+   u8 *mac_addr;
+   bool strict_match;
+};
+
+static int alb_upper_dev_walk(struct net_device *upper, void *data)
+{
+   struct alb_walk_data *_data = (struct alb_walk_data *)data;
+   bool strict_match = _data->strict_match;
+   struct bonding *bond = _data->bond;
+   struct slave *slave = _data->slave;
+   u8 *mac_addr = _data->mac_addr;
+   struct bond_vlan_tag *tags;
+
+   if (is_vlan_dev(upper) && vlan_get_encap_level(upper) == 0) {
+   if (strict_match &&
+   ether_addr_equal_64bits(mac_addr,
+   upper->dev_addr)) {
+   alb_send_lp_vid(slave, mac_addr,
+   vlan_dev_vlan_proto(upper),
+   vlan_dev_vlan_id(upper));
+   } else if (!strict_match) {
+   alb_send_lp_vid(slave, upper->dev_addr,
+   vlan_dev_vlan_proto(upper),
+   vlan_dev_vlan_id(upper));
+   }
+   }
+
+   /* If this is a macvlan device, then only send updates
+* when strict_match is turned off.
+*/
+   if (netif_is_macvlan(upper) && !strict_match) {
+   tags = bond_verify_device_path(bond->dev, upper, 0);
+   if (IS_ERR_OR_NULL(tags))
+   BUG();
+   alb_send_lp_vid(slave, upper->dev_addr,
+   tags[0].vlan_proto, tags[0].vlan_id);
+   kfree(tags);
+   }
+
+   return 0;
+}
+
 static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[],
  bool strict_match)
 {
struct bonding *bond = bond_get_bond_by_slave(slave);
-   struct net_device *upper;
-   struct list_head *iter;
-   struct bond_vlan_tag *tags;
+   struct alb_walk_data data = {
+   .strict_match = strict_match,
+   .mac_addr = mac_addr,
+   .slave = slave,
+   .bond = bond,
+   };
 
/* send untagged */
alb_send_lp_vid(slave, mac_addr, 0, 0);
@@ -965,33 +1013,7 @@ static void alb_send_learning_packets(struct slave 
*slave, u8 mac_addr[],
 * for that device.
 */
rcu_read_lock();
-   netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) {
-   if (is_vlan_dev(upper) && vlan_get_encap_level(upper) == 0) {
-   if (strict_match &&
-   ether_addr_equal_64bits(mac_addr,
-   upper->dev_addr)) {
-   alb_send_lp_vid(slave, mac_addr,
-   vlan_dev_vlan_proto(upper),
-   vlan_dev_vlan_id(upper));
-   } else if (!strict_match) {
-   alb_send_lp_vid(slave, upper->dev_addr,
-   vlan_dev_vlan_proto(upper),
-   vlan_dev_vlan_id(upper));
-   }
-   }
-
-   /* If this is a macvlan device, then only send updates
-* when strict_match is turned off.
-*/
-   if (netif_is_macvlan(upper) && !strict_match) {
-   tags = bond_verify_device_path(bond->dev, upper, 0);
-   if (IS_ERR_OR_NULL(tags))
-   BUG();
-   alb_send_lp_vid(slave, upper->dev_addr,
-   tags[0].vlan_proto, tags[0].vlan_id);
-   kfree(tags);
-   }
-   }
+   netdev_walk_all_upper_dev_rcu(bond->dev, alb_upper_dev_walk, &data);
rcu_read_unlock();
 }
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5fa36ebc0640..89191b019178 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2270,22 +2270,27 @@ static void bond_mii_monitor(struct work_struct *work)
}
 }
 

[PATCH net-next 04/11] IB/core: Flip to the new dev walk API

2016-10-14 Thread David Ahern
Convert rdma_is_upper_dev_rcu, handle_netdev_upper and
ipoib_get_net_dev_match_addr to the new upper device walk API.
This is just a code conversion; no functional change is intended.

Signed-off-by: David Ahern 
---
 drivers/infiniband/core/core_priv.h |  9 +--
 drivers/infiniband/core/roce_gid_mgmt.c | 42 ++---
 2 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/drivers/infiniband/core/core_priv.h 
b/drivers/infiniband/core/core_priv.h
index 19d499dcab76..0c0bea091de8 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -127,14 +127,7 @@ void ib_cache_release_one(struct ib_device *device);
 static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
 struct net_device *upper)
 {
-   struct net_device *_upper = NULL;
-   struct list_head *iter;
-
-   netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
-   if (_upper == upper)
-   break;
-
-   return _upper == upper;
+   return netdev_has_upper_dev_all_rcu(dev, upper);
 }
 
 int addr_init(void);
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c 
b/drivers/infiniband/core/roce_gid_mgmt.c
index 06556c34606d..db759a68d948 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -437,6 +437,28 @@ static void callback_for_addr_gid_device_scan(struct 
ib_device *device,
  &parsed->gid_attr);
 }
 
+struct upper_list {
+   struct list_head list;
+   struct net_device *upper;
+};
+
+static int netdev_upper_walk(struct net_device *upper, void *data)
+{
+   struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+   struct list_head *upper_list = (struct list_head *)data;
+
+   if (!entry) {
+   pr_info("roce_gid_mgmt: couldn't allocate entry to delete 
ndev\n");
+   return 0;
+   }
+
+   list_add_tail(&entry->list, upper_list);
+   dev_hold(upper);
+   entry->upper = upper;
+
+   return 0;
+}
+
 static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
void *cookie,
void (*handle_netdev)(struct ib_device *ib_dev,
@@ -444,30 +466,12 @@ static void handle_netdev_upper(struct ib_device *ib_dev, 
u8 port,
  struct net_device *ndev))
 {
struct net_device *ndev = (struct net_device *)cookie;
-   struct upper_list {
-   struct list_head list;
-   struct net_device *upper;
-   };
-   struct net_device *upper;
-   struct list_head *iter;
struct upper_list *upper_iter;
struct upper_list *upper_temp;
LIST_HEAD(upper_list);
 
rcu_read_lock();
-   netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
-   struct upper_list *entry = kmalloc(sizeof(*entry),
-  GFP_ATOMIC);
-
-   if (!entry) {
-   pr_info("roce_gid_mgmt: couldn't allocate entry to 
delete ndev\n");
-   continue;
-   }
-
-   list_add_tail(&entry->list, &upper_list);
-   dev_hold(upper);
-   entry->upper = upper;
-   }
+   netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &upper_list);
rcu_read_unlock();
 
handle_netdev(ib_dev, port, ndev);
-- 
2.1.4



[PATCH net-next 01/11] net: Remove refnr arg when inserting link adjacencies

2016-10-14 Thread David Ahern
Commit 93409033ae65 ("net: Add netdev all_adj_list refcnt propagation to
fix panic") propagated the refnr to insert and remove functions tracking
the netdev adjacency graph. However, for the insert path the refnr can
only be 1. Accordingly, remove the refnr argument to make that clear.
i.e., the refnr arg in 93409033ae65 was only needed for the remove path.

Signed-off-by: David Ahern 
---
 net/core/dev.c | 27 ---
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 6498cc2ba8f6..5399af8fdac0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5453,7 +5453,6 @@ static inline bool netdev_adjacent_is_neigh_list(struct 
net_device *dev,
 
 static int __netdev_adjacent_dev_insert(struct net_device *dev,
struct net_device *adj_dev,
-   u16 ref_nr,
struct list_head *dev_list,
void *private, bool master)
 {
@@ -5463,7 +5462,7 @@ static int __netdev_adjacent_dev_insert(struct net_device 
*dev,
adj = __netdev_find_adj(adj_dev, dev_list);
 
if (adj) {
-   adj->ref_nr += ref_nr;
+   adj->ref_nr += 1;
return 0;
}
 
@@ -5473,7 +5472,7 @@ static int __netdev_adjacent_dev_insert(struct net_device 
*dev,
 
adj->dev = adj_dev;
adj->master = master;
-   adj->ref_nr = ref_nr;
+   adj->ref_nr = 1;
adj->private = private;
dev_hold(adj_dev);
 
@@ -5547,22 +5546,21 @@ static void __netdev_adjacent_dev_remove(struct 
net_device *dev,
 
 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
struct net_device *upper_dev,
-   u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list,
void *private, bool master)
 {
int ret;
 
-   ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
+   ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
   private, master);
if (ret)
return ret;
 
-   ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
+   ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
   private, false);
if (ret) {
-   __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
+   __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
return ret;
}
 
@@ -5570,10 +5568,9 @@ static int __netdev_adjacent_dev_link_lists(struct 
net_device *dev,
 }
 
 static int __netdev_adjacent_dev_link(struct net_device *dev,
- struct net_device *upper_dev,
- u16 ref_nr)
+ struct net_device *upper_dev)
 {
-   return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
+   return __netdev_adjacent_dev_link_lists(dev, upper_dev,
&dev->all_adj_list.upper,
&upper_dev->all_adj_list.lower,
NULL, false);
@@ -5602,12 +5599,12 @@ static int __netdev_adjacent_dev_link_neighbour(struct 
net_device *dev,
struct net_device *upper_dev,
void *private, bool master)
 {
-   int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
+   int ret = __netdev_adjacent_dev_link(dev, upper_dev);
 
if (ret)
return ret;
 
-   ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
+   ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
   &dev->adj_list.upper,
   &upper_dev->adj_list.lower,
   private, master);
@@ -5676,7 +5673,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
pr_debug("Interlinking %s with %s, non-neighbour\n",
 i->dev->name, j->dev->name);
-   ret = __netdev_adjacent_dev_link(i->dev, j->dev, 
i->ref_nr);
+   ret = __netdev_adjacent_dev_link(i->dev, j->dev);
if (ret)
goto rollback_mesh;
}
@@ -5686,7 +5683,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
pr_debug("linking %s's upper 

[PATCH v2 net-next 00/11] net: Fix netdev adjacency tracking

2016-10-14 Thread David Ahern
The netdev adjacency tracking is failing to create proper dependencies
for some topologies. For example this topology

        +--------+
        |  myvrf |
        +--------+
          |    |
          |  +---------+
          |  | macvlan |
          |  +---------+
          |    |
      +----------+
      |  bridge  |
      +----------+
          |
      +--------+
      | bond1  |
      +--------+
          |
      +--------+
      |  eth3  |
      +--------+

hits 1 of 2 problems depending on the order of enslavement. The base set of
commands for both cases:

ip link add bond1 type bond
ip link set bond1 up
ip link set eth3 down
ip link set eth3 master bond1
ip link set eth3 up

ip link add bridge type bridge
ip link set bridge up
ip link add macvlan link bridge type macvlan
ip link set macvlan up

ip link add myvrf type vrf table 1234
ip link set myvrf up

ip link set bridge master myvrf

Case 1 enslave macvlan to the vrf before enslaving the bond to the bridge:

ip link set macvlan master myvrf
ip link set bond1 master bridge

Attempts to delete the VRF:
ip link delete myvrf

trigger the BUG in __netdev_adjacent_dev_remove:

[  587.405260] tried to remove device eth3 from myvrf
[  587.407269] ------------[ cut here ]------------
[  587.408918] kernel BUG at /home/dsa/kernel.git/net/core/dev.c:5661!
[  587.411113] invalid opcode: 0000 [#1] SMP
[  587.412454] Modules linked in: macvlan bridge stp llc bonding vrf
[  587.414765] CPU: 0 PID: 726 Comm: ip Not tainted 4.8.0+ #109
[  587.416766] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.7.5-20140531_083030-gandalf 04/01/2014
[  587.420241] task: 88013ab6eec0 task.stack: c9628000
[  587.422163] RIP: 0010:[]  [] 
__netdev_adjacent_dev_remove+0x40/0x12c
...
[  587.446053] Call Trace:
[  587.446424]  [] __netdev_adjacent_dev_unlink+0x20/0x3c
[  587.447390]  [] netdev_upper_dev_unlink+0xfa/0x15e
[  587.448297]  [] vrf_del_slave+0x13/0x2a [vrf]
[  587.449153]  [] vrf_dev_uninit+0xea/0x114 [vrf]
[  587.450036]  [] rollback_registered_many+0x22b/0x2da
[  587.450974]  [] unregister_netdevice_many+0x17/0x48
[  587.451903]  [] rtnl_delete_link+0x3c/0x43
[  587.452719]  [] rtnl_dellink+0x180/0x194

When the BUG is converted to a WARN_ON it shows 4 missing adjacencies:
  eth3 - myvrf, myvrf - eth3, bond1 - myvrf and myvrf - bond1

All of those are because the __netdev_upper_dev_link function does not
properly link macvlan lower devices to myvrf when it is enslaved.

The second case just flips the ordering of the enslavements:
ip link set bond1 master bridge
ip link set macvlan master myvrf

Then run:
ip link delete bond1
ip link delete myvrf

The vrf delete command hangs because myvrf has a reference that has not
been released. In this case the removal code does not account for 2 paths 
between eth3 and myvrf - one from bridge to vrf and the other through the
macvlan.

Rather than try to maintain a linked list of all upper and lower devices
per netdevice, only track the direct neighbors. The remaining stack can
be determined by recursively walking the neighbors.

The existing netdev_for_each_all_upper_dev_rcu,
netdev_for_each_all_lower_dev and netdev_for_each_all_lower_dev_rcu macros
are replaced with APIs that walk the upper and lower device lists. The
new APIs take a callback function and a data arg that is passed to the
callback for each device in the list. Drivers using the old macros are
converted in separate patches to make it easier on reviewers. It is an
API conversion only; no functional change is intended.

DaveM: Given the impact of this bug (both cases requiring a reboot) I
would like to get this backported to at least the 4.8 tree which as I 
understand it has been targeted as the next LTS.

v2
- fixed bond0 references in cover-letter
- fixed definition of netdev_next_lower_dev_rcu to mirror the upper_dev
  version.

David Ahern (11):
  net: Remove refnr arg when inserting link adjacencies
  net: Introduce new api for walking upper and lower devices
  net: bonding: Flip to the new dev walk API
  IB/core: Flip to the new dev walk API
  IB/ipoib: Flip to new dev walk API
  ixgbe: Flip to the new dev walk API
  mlxsw: Flip to the new dev walk API
  rocker: Flip to the new dev walk API
  net: Remove all_adj_list and its references
  net: Add warning if any lower device is still in adjacency list
  net: dev: Improve debug statements for adjacency tracking

 drivers/infiniband/core/core_priv.h|   9 +-
 drivers/infiniband/core/roce_gid_mgmt.c|  42 +--
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |  37 ++-
 drivers/net/bonding/bond_alb.c |  82 +++---
 drivers/net/bonding/bond_main.c|  21 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 132 +
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c |  37 ++-
 drivers/net/ethernet/rocker/rocker_main.c  |  31 ++-
 include/linux/netdevice.h

Re: [PATCH] ethtool: Zero memory allocated for statistics

2016-10-14 Thread David Miller
From: Vlad Tsyrklevich 
Date: Fri, 14 Oct 2016 11:59:18 +0200

> enic_get_ethtool_stats()

Looking merely at this shows the real problem.

We don't propagate and handle errors for this method.

And that's what we should fix, making the get_ethtool_stats() method
return an integer error.

Then ethtool_get_stats() should return any non-zero value provided by
ops->get_ethtool_stats() and not attempt to copy any bytes of 'data'
to userspace in that case.


[PATCH 2/2] sunbmac: Fix compiler warning

2016-10-14 Thread Tushar Dave
sunbmac uses '__u32' for dma handle while invoking kernel DMA APIs,
instead of using dma_addr_t. This hasn't caused any 'incompatible
pointer type' warning on SPARC because until now dma_addr_t is of
type u32. However, recent changes in SPARC ATU (iommu) enables 64bit
DMA and therefore dma_addr_t becomes of type u64. This makes
'incompatible pointer type' warnings inevitable.

e.g.
drivers/net/ethernet/sun/sunbmac.c: In function ‘bigmac_ether_init’:
drivers/net/ethernet/sun/sunbmac.c:1166: warning: passing argument 3 of 
‘dma_alloc_coherent’ from incompatible pointer type
./include/linux/dma-mapping.h:445: note: expected ‘dma_addr_t *’ but argument 
is of type ‘__u32 *’

This patch resolves above compiler warning.

Signed-off-by: Tushar Dave 
Reviewed-by: chris hyser 
---
 drivers/net/ethernet/sun/sunbmac.c | 5 +++--
 drivers/net/ethernet/sun/sunbmac.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunbmac.c 
b/drivers/net/ethernet/sun/sunbmac.c
index aa4f9d2..02f4527 100644
--- a/drivers/net/ethernet/sun/sunbmac.c
+++ b/drivers/net/ethernet/sun/sunbmac.c
@@ -623,6 +623,7 @@ static int bigmac_init_hw(struct bigmac *bp, int from_irq)
void __iomem *gregs= bp->gregs;
void __iomem *cregs= bp->creg;
void __iomem *bregs= bp->bregs;
+   __u32 bblk_dvma = (__u32)bp->bblock_dvma;
unsigned char *e = &bp->dev->dev_addr[0];
 
/* Latch current counters into statistics. */
@@ -671,9 +672,9 @@ static int bigmac_init_hw(struct bigmac *bp, int from_irq)
bregs + BMAC_XIFCFG);
 
/* Tell the QEC where the ring descriptors are. */
-   sbus_writel(bp->bblock_dvma + bib_offset(be_rxd, 0),
+   sbus_writel(bblk_dvma + bib_offset(be_rxd, 0),
cregs + CREG_RXDS);
-   sbus_writel(bp->bblock_dvma + bib_offset(be_txd, 0),
+   sbus_writel(bblk_dvma + bib_offset(be_txd, 0),
cregs + CREG_TXDS);
 
/* Setup the FIFO pointers into QEC local memory. */
diff --git a/drivers/net/ethernet/sun/sunbmac.h 
b/drivers/net/ethernet/sun/sunbmac.h
index 06dd217..532fc56 100644
--- a/drivers/net/ethernet/sun/sunbmac.h
+++ b/drivers/net/ethernet/sun/sunbmac.h
@@ -291,7 +291,7 @@ struct bigmac {
void __iomem*bregs; /* BigMAC Registers   */
void __iomem*tregs; /* BigMAC Transceiver */
struct bmac_init_block  *bmac_block;/* RX and TX descriptors */
-   __u32bblock_dvma;   /* RX and TX descriptors */
+   dma_addr_t  bblock_dvma;/* RX and TX descriptors */
 
spinlock_t  lock;
 
-- 
1.9.1



[PATCH 0/2] net: Fix compiler warnings

2016-10-14 Thread Tushar Dave
Recently, ATU (iommu) changes are submitted to linux-sparc that
enables 64bit DMA on SPARC. However, this change also makes
'incompatible pointer type' compiler warnings inevitable on sunqe
and sunbmac driver.

The two patches in series fix compiler warnings.

Tushar Dave (2):
  sunqe: Fix compiler warnings
  sunbmac: Fix compiler warning

 drivers/net/ethernet/sun/sunbmac.c |  5 +++--
 drivers/net/ethernet/sun/sunbmac.h |  2 +-
 drivers/net/ethernet/sun/sunqe.c   | 11 ++-
 drivers/net/ethernet/sun/sunqe.h   |  4 ++--
 4 files changed, 12 insertions(+), 10 deletions(-)

-- 
1.9.1



[PATCH 1/2] sunqe: Fix compiler warnings

2016-10-14 Thread Tushar Dave
sunqe uses '__u32' for dma handle while invoking kernel DMA APIs,
instead of using dma_addr_t. This hasn't caused any 'incompatible
pointer type' warning on SPARC because until now dma_addr_t is of
type u32. However, recent changes in SPARC ATU (iommu) enables 64bit
DMA and therefore dma_addr_t becomes of type u64. This makes
'incompatible pointer type' warnings inevitable.

e.g.
drivers/net/ethernet/sun/sunqe.c: In function ‘qec_ether_init’:
drivers/net/ethernet/sun/sunqe.c:883: warning: passing argument 3 of 
‘dma_alloc_coherent’ from incompatible pointer type
./include/linux/dma-mapping.h:445: note: expected ‘dma_addr_t *’ but argument 
is of type ‘__u32 *’
drivers/net/ethernet/sun/sunqe.c:885: warning: passing argument 3 of 
‘dma_alloc_coherent’ from incompatible pointer type
./include/linux/dma-mapping.h:445: note: expected ‘dma_addr_t *’ but argument 
is of type ‘__u32 *’

This patch resolves above compiler warnings.

Signed-off-by: Tushar Dave 
Reviewed-by: chris hyser 
---
 drivers/net/ethernet/sun/sunqe.c | 11 ++-
 drivers/net/ethernet/sun/sunqe.h |  4 ++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunqe.c b/drivers/net/ethernet/sun/sunqe.c
index 9b825780..9582948 100644
--- a/drivers/net/ethernet/sun/sunqe.c
+++ b/drivers/net/ethernet/sun/sunqe.c
@@ -124,7 +124,7 @@ static void qe_init_rings(struct sunqe *qep)
 {
struct qe_init_block *qb = qep->qe_block;
struct sunqe_buffers *qbufs = qep->buffers;
-   __u32 qbufs_dvma = qep->buffers_dvma;
+   __u32 qbufs_dvma = (__u32)qep->buffers_dvma;
int i;
 
qep->rx_new = qep->rx_old = qep->tx_new = qep->tx_old = 0;
@@ -144,6 +144,7 @@ static int qe_init(struct sunqe *qep, int from_irq)
void __iomem *mregs = qep->mregs;
void __iomem *gregs = qecp->gregs;
unsigned char *e = &qep->dev->dev_addr[0];
+   __u32 qblk_dvma = (__u32)qep->qblock_dvma;
u32 tmp;
int i;
 
@@ -152,8 +153,8 @@ static int qe_init(struct sunqe *qep, int from_irq)
return -EAGAIN;
 
/* Setup initial rx/tx init block pointers. */
-   sbus_writel(qep->qblock_dvma + qib_offset(qe_rxd, 0), cregs + 
CREG_RXDS);
-   sbus_writel(qep->qblock_dvma + qib_offset(qe_txd, 0), cregs + 
CREG_TXDS);
+   sbus_writel(qblk_dvma + qib_offset(qe_rxd, 0), cregs + CREG_RXDS);
+   sbus_writel(qblk_dvma + qib_offset(qe_txd, 0), cregs + CREG_TXDS);
 
/* Enable/mask the various irq's. */
sbus_writel(0, cregs + CREG_RIMASK);
@@ -413,7 +414,7 @@ static void qe_rx(struct sunqe *qep)
struct net_device *dev = qep->dev;
struct qe_rxd *this;
struct sunqe_buffers *qbufs = qep->buffers;
-   __u32 qbufs_dvma = qep->buffers_dvma;
+   __u32 qbufs_dvma = (__u32)qep->buffers_dvma;
int elem = qep->rx_new;
u32 flags;
 
@@ -572,7 +573,7 @@ static int qe_start_xmit(struct sk_buff *skb, struct 
net_device *dev)
 {
struct sunqe *qep = netdev_priv(dev);
struct sunqe_buffers *qbufs = qep->buffers;
-   __u32 txbuf_dvma, qbufs_dvma = qep->buffers_dvma;
+   __u32 txbuf_dvma, qbufs_dvma = (__u32)qep->buffers_dvma;
unsigned char *txbuf;
int len, entry;
 
diff --git a/drivers/net/ethernet/sun/sunqe.h b/drivers/net/ethernet/sun/sunqe.h
index 581781b..ae190b7 100644
--- a/drivers/net/ethernet/sun/sunqe.h
+++ b/drivers/net/ethernet/sun/sunqe.h
@@ -334,12 +334,12 @@ struct sunqe {
void __iomem*qcregs;/* QEC 
per-channel Registers   */
void __iomem*mregs; /* Per-channel MACE 
Registers  */
struct qe_init_block*qe_block;  /* RX and TX 
descriptors   */
-   __u32   qblock_dvma;/* RX and TX 
descriptors   */
+   dma_addr_t  qblock_dvma;/* RX and TX 
descriptors   */
spinlock_t  lock;   /* Protects txfull 
state   */
int rx_new, rx_old; /* RX ring extents  
   */
int tx_new, tx_old; /* TX ring extents  
   */
struct sunqe_buffers*buffers;   /* CPU visible address. 
   */
-   __u32   buffers_dvma;   /* DVMA visible 
address.   */
+   dma_addr_t  buffers_dvma;   /* DVMA visible 
address.   */
struct sunqec   *parent;
u8  mconfig;/* Base MACE mconfig 
value */
struct platform_device  *op;/* QE's OF device 
struct   */
-- 
1.9.1



Re: bug in ixgbe_atr

2016-10-14 Thread Sowmini Varadhan
On (10/14/16 16:09), Duyck, Alexander H wrote:
> Sorry I was thinking of a different piece of code.  In the case of the
> atr code it would be hdr.network, not hdr.raw.  Basically the thought
> was to validate that there is enough data in skb_headlen() that we can
> verify that from where the network header should be we have at least
> 40 bytes of data as that would be the minimum needed for a TCP header
> and an IPv4 header, or just an IPv6 header.  We would probably need a
> separate follow-up for the TCP header after we validate network header.
   :
>> Dropping it is fine with me I guess - maybe just return, if the
>> skb_headlen() doesnt have enough bytes for a network header, i.e.,
>> skb_headlen
>> is at least ETH_HLEN + sizeof (struct iphdr) for ETH_P_IP, or  ETH_HLEN +
>> sizeof (struct ipv6hdr) for ETH_P_IPV6?

> Right that is kind of what I was thinking.  If we validate that we
> have at least 40 before inspecting the network header, and at least 20
> before we validate the TCP header that would work for me.

yes, I was on a plane through most of the day today but thought about
this. I think we can check if skb_network_offset() is between
skb->data and tail, and also make sure there are "enough" bytes for
trying to find the ip and transport header. 
Let me try to put a RFC patch together for this tomorrow.


Layer 2 over IPv6 GRE and path MTU discovery

2016-10-14 Thread Mike Walker
When using a layer 2 GREv6 tunnel (ip6gretap), I am using a Linux
bridge to push Ethernet frames from an Ethernet port to the GREv6
device.

Here is an example of the topology:

PC -> eth0 -> grebridge -> gre6dev -> (internet) -> GRE endpoint -> Remote host

In this case, the PC connected to the Ethernet port is using IPv6 to
communicate with the remote host, so the source and destination IP of
the traffic being sent by the PC are both IPv6 addresses.  So we have
an IPv6 header, Ethernet header, then GRE header once the
encapsulation is done.

Sometimes these packets are too large for the GRE tunnel's MTU.  When
this happens, the router's kernel wants to send an ICMP "packet too
big" error message back to the PC.

However, the router has no routing information for the PC.  The path
from the PC to the remote host is all supposed to be layer 2.  The
router is not configured to route traffic to the PC or the remote
host, only to bridge the layer 2 frames.

What happens then is Linux tries to send an ICMP error, it can't find
the route, or else it sends it to its default route, none of which do
any good.

If the PC doesn't get this ICMP error, it will not know why the
packets were dropped, or it won't even know they were dropped.  It's
an ICMP blackhole scenario right?

So, one solution I tried was hacking the kernel so that if it's trying
to send this ICMP "packet too big" error to a host, and we know it's a
layer 2 GRE tunnel, instead of the normal logic, force the ICMP error
message to be sent back out via the network interface the offending
packet was received on.

This mostly worked, the PC receives the ICMP error and adjusts its
path MTU, so in the future it will know to fragment the packet if it's
too big.

Problem is, I don't know what source IP and mac address I should be
using when I send back this ICMP error to the PC.  Normally this
network path doesn't have any layer 3 address, and even the mac
address normally is transparent / unknown to the PC.  For my prototype
I simply set the source IP of the ICMP error to whatever was the
destination IP of the packet that was too big.  I let the kernel use
the mac address of either the bridge or eth0.

I couldn't seem to find any RFC that says how this should be handled.
Any ideas?


Re: Kernel 4.6.7-rt13: Intel Ethernet driver igb causes huge latencies in cyclictest

2016-10-14 Thread Richard Cochran
On Fri, Oct 14, 2016 at 08:58:22AM +0000, Koehrer Mathias (ETAS/ESW5) wrote:
> @@ -753,7 +756,9 @@ u32 igb_rd32(struct e1000_hw *hw, u32 re
>   if (E1000_REMOVED(hw_addr))
>   return ~value;
>  
> +trace_igb(801);
>   value = readl(&hw_addr[reg]);
> +trace_igb(802);

Nothing prevents this code from being preempted between the two trace
points, and so you can't be sure whether the time delta in the trace
is caused by the PCIe read stalling or not.

Thanks,
Richard




Re: [PATCH] net: limit a number of namespaces which can be cleaned up concurrently

2016-10-14 Thread Andrei Vagin
On Thu, Oct 13, 2016 at 10:06:28PM -0500, Eric W. Biederman wrote:
> Andrei Vagin  writes:
> 
> > On Thu, Oct 13, 2016 at 10:49:38AM -0500, Eric W. Biederman wrote:
> >> Andrei Vagin  writes:
> >> 
> >> > From: Andrey Vagin 
> >> >
> >> > The operation of destroying netns is heavy and it is executed under
> >> > net_mutex. If many namespaces are destroyed concurrently, net_mutex can
> >> > be locked for a long time. It is impossible to create a new netns during
> >> > this period of time.
> >> 
> >> This may be the right approach or at least the right approach to bound
> >> net_mutex hold times but I have to take exception to calling network
> >> namespace cleanup heavy.
> >> 
> >> The only particularly time consuming operation I have ever found are calls 
> >> to
> >> synchronize_rcu/sycrhonize_sched/synchronize_net.
> >
> > I booted the kernel with maxcpus=1, in this case these functions work
> > very fast and the problem is there any way.
> >
> > Accoding to perf, we spend a lot of time in kobject_uevent:
> >
> > -   99.96% 0.00%  kworker/u4:1 [kernel.kallsyms]  [k] 
> > unregister_netdevice_many
> >- unregister_netdevice_many
> >   - 99.95% rollback_registered_many
> >  - 99.64% netdev_unregister_kobject
> > - 33.43% netdev_queue_update_kobjects
> >- 33.40% kobject_put
> >   - kobject_release
> >  + 33.37% kobject_uevent
> >  + 0.03% kobject_del
> >+ 0.03% sysfs_remove_group
> > - 33.13% net_rx_queue_update_kobjects
> >- kobject_put
> >- kobject_release
> >   + 33.11% kobject_uevent
> >   + 0.01% kobject_del
> > 0.00% rx_queue_release
> > - 33.08% device_del
> >+ 32.75% kobject_uevent
> >+ 0.17% device_remove_attrs
> >+ 0.07% dpm_sysfs_remove
> >+ 0.04% device_remove_class_symlinks
> >+ 0.01% kobject_del
> >+ 0.01% device_pm_remove
> >+ 0.01% sysfs_remove_file_ns
> >+ 0.00% klist_del
> >+ 0.00% driver_deferred_probe_del
> >  0.00% cleanup_glue_dir.isra.14.part.15
> >  0.00% to_acpi_device_node
> >  0.00% sysfs_remove_group
> >   0.00% klist_del
> >   0.00% device_remove_attrs
> >  + 0.26% call_netdevice_notifiers_info
> >  + 0.04% rtmsg_ifinfo_build_skb
> >  + 0.01% rtmsg_ifinfo_send
> > 0.00% dev_uc_flush
> > 0.00% netif_reset_xps_queues_gt
> >
> > Someone can listen these uevents, so we can't stop sending them without
> > breaking backward compatibility. We can try to optimize
> > kobject_uevent...
> 
> Oh that is a surprise.  We can definitely skip genenerating uevents for
> network namespaces that are exiting because by definition no one can see
> those network namespaces.  If a socket existed that could see those
> uevents it would hold a reference to the network namespace and as such
> the network namespace could not exit.
> 
> That sounds like it is worth investigating a little more deeply.
> 
> I am surprised that allocation and freeing is so heavy we are spending
> lots of time doing that.  On the other hand kobj_bcast_filter is very
> dumb and very late so I expect something can be moved earlier and make
> that code cheaper with the tiniest bit of work.
> 

I'm sorry, I've collected this data for a kernel with debug options
(DEBUG_SPINLOCK, PROVE_LOCKING, DEBUG_LIST, etc). If a kernel is
compiled without debug options, kobject_uevent becomes less expensive,
but still expensive.

-   98.64% 0.00%  kworker/u4:2  [kernel.kallsyms][k] cleanup_net
   - cleanup_net
  - 98.54% ops_exit_list.isra.4
 - 60.48% default_device_exit_batch
- 60.40% unregister_netdevice_many
   - rollback_registered_many
  - 59.82% netdev_unregister_kobject
 - 20.10% device_del
+ 19.44% kobject_uevent
+ 0.40% device_remove_attrs
+ 0.17% dpm_sysfs_remove
+ 0.04% device_remove_class_symlinks
+ 0.04% kobject_del
+ 0.01% device_pm_remove
+ 0.01% sysfs_remove_file_ns
 - 19.89% netdev_queue_update_kobjects
+ 19.81% kobject_put
+ 0.07% sysfs_remove_group
 - 19.79% net_rx_queue_update_kobjects
  kobject_put
- kobject_release
   + 19.77% kobject_uevent
   + 0.02% kobject_del
 0.01% rx_queue_release
 + 0.02% kset_unregister
 

[PATCH v2] vmxnet3: avoid assumption about invalid dma_pa in vmxnet3_set_mc()

2016-10-14 Thread Alexey Khoroshilov
vmxnet3_set_mc() checks new_table_pa returned by dma_map_single()
with dma_mapping_error(), but even there it assumes zero is invalid pa
(it assumes dma_mapping_error(...,0) returns true if new_table is NULL).

The patch adds an explicit variable to track status of new_table_pa.

Found by Linux Driver Verification project (linuxtesting.org).

v2: use "bool" and "true"/"false" for boolean variables.
Signed-off-by: Alexey Khoroshilov 
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c 
b/drivers/net/vmxnet3/vmxnet3_drv.c
index b5554f2ebee4..ef83ae3b0a44 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2279,6 +2279,7 @@ vmxnet3_set_mc(struct net_device *netdev)
&adapter->shared->devRead.rxFilterConf;
u8 *new_table = NULL;
dma_addr_t new_table_pa = 0;
+   bool new_table_pa_valid = false;
u32 new_mode = VMXNET3_RXM_UCAST;
 
if (netdev->flags & IFF_PROMISC) {
@@ -2307,13 +2308,15 @@ vmxnet3_set_mc(struct net_device *netdev)
new_table,
sz,
PCI_DMA_TODEVICE);
+   if (!dma_mapping_error(&adapter->pdev->dev,
+  new_table_pa)) {
+   new_mode |= VMXNET3_RXM_MCAST;
+   new_table_pa_valid = true;
+   rxConf->mfTablePA = cpu_to_le64(
+   new_table_pa);
+   }
}
-
-   if (!dma_mapping_error(&adapter->pdev->dev,
-  new_table_pa)) {
-   new_mode |= VMXNET3_RXM_MCAST;
-   rxConf->mfTablePA = cpu_to_le64(new_table_pa);
-   } else {
+   if (!new_table_pa_valid) {
netdev_info(netdev,
"failed to copy mcast list, setting 
ALL_MULTI\n");
new_mode |= VMXNET3_RXM_ALL_MULTI;
@@ -2338,7 +2341,7 @@ vmxnet3_set_mc(struct net_device *netdev)
   VMXNET3_CMD_UPDATE_MAC_FILTERS);
spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 
-   if (new_table_pa)
+   if (new_table_pa_valid)
dma_unmap_single(&adapter->pdev->dev, new_table_pa,
 rxConf->mfTableLen, PCI_DMA_TODEVICE);
kfree(new_table);
-- 
2.7.4



Re: [PATCH v2] r8169: set coherent DMA mask as well as streaming DMA mask

2016-10-14 Thread Francois Romieu
Ard Biesheuvel  :
> PCI devices that are 64-bit DMA capable should set the coherent
> DMA mask as well as the streaming DMA mask. On some architectures,
> these are managed separately, and so the coherent DMA mask will be
> left at its default value of 32 if it is not set explicitly. This
> results in errors such as
> 
>  r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>  hwdev DMA mask = 0x00000000ffffffff, dev_addr = 0x00000080fbfff000
>  swiotlb: coherent allocation failed for device 0000:02:00.0 size=4096
>  CPU: 0 PID: 1062 Comm: systemd-udevd Not tainted 4.8.0+ #35
>  Hardware name: AMD Seattle/Seattle, BIOS 10:53:24 Oct 13 2016
> 
> on systems without memory that is 32-bit addressable by PCI devices.
> 
> Signed-off-by: Ard Biesheuvel 

Acked-by: Francois Romieu 

Unless someone plans to plug an acenic, a 83820 (pci-e gem board, anyone ?)
on top of a pci <-> pci-e adapter on this kind of motherboard, no other
network driver that uses the pci_... dma api exhibits this mixed 32 / 64 bit
support bug. I haven't checked devices with 32 < mask < 64 nor plain DMA api
converted ones.

-- 
Ueimor


Re: [PATCH NET] ethtool: silence warning on bit loss

2016-10-14 Thread David Miller
From: Jesse Brandeburg 
Date: Thu, 13 Oct 2016 16:13:55 -0700

> Sparse was complaining when we went to prototype some code
> using ethtool_cmd_speed_set and SPEED_10, which uses
> the upper 16 bits of __u32 speed for the first time.
> 
> CHECK
> ...
> .../uapi/linux/ethtool.h:123:28: warning:
>   cast truncates bits from constant value (186a0 becomes 86a0)
> 
> The warning is actually bogus, as no bits are really lost, but
> we can get rid of the sparse warning with this one small change.
> 
> Reported-by: Preethi Banala 
> Signed-off-by: Jesse Brandeburg 

Ok, I'll apply this.

There were alternative suggestions but I like this patch
because it makes it explicit what is going on.

Just removing the u16 cast requires the reader to implicitly
understand and know the types in the structure.


Re: pull-request: wireless-drivers 2016-10-14

2016-10-14 Thread David Miller
From: Kalle Valo 
Date: Fri, 14 Oct 2016 10:18:42 +0300

> first wireless-drivers pull request for 4.9 and this time we have
> unusually many fixes even before -rc1 is released. Most important here
> are the wlcore and rtlwifi commits which fix critical regressions,
> otherwise smaller impact fixes and one new sdio id for ath6kl.
> 
> Please let me know if there are any problems.

Pulled, thanks Kalle.


Re: [PATCH] net: asix: Avoid looping when the device does not respond

2016-10-14 Thread David Miller
From: Guenter Roeck 
Date: Thu, 13 Oct 2016 16:43:16 -0700

> Check answers from USB stack and avoid re-sending the request
> multiple times if the device does not respond.
> 
> This fixes the following problem, observed with a probably flaky adapter.
 ...
> Since the USB timeout is 5 seconds, and the operation is retried 30 times,
> this results in
 ...
> Signed-off-by: Guenter Roeck 

Applied, thanks.


Re: Kernel 4.6.7-rt13: Intel Ethernet driver igb causes huge latencies in cyclictest

2016-10-14 Thread Julia Cartwright
On Fri, Oct 14, 2016 at 08:58:22AM +0000, Koehrer Mathias (ETAS/ESW5) wrote:
> Hi Julia,
>
> > Have you tested on a vanilla (non-RT) kernel?  I doubt there is anything RT 
> > specific
> > about what you are seeing, but it might be nice to get confirmation.  Also, 
> > bisection
> > would probably be easier if you confirm on a vanilla kernel.
> >
> > I find it unlikely that it's a kernel config option that changed which 
> > regressed you, but
> > instead was a code change to a driver.  Which driver is now the question, 
> > and the
> > surface area is still big (processor mapping attributes for this region, 
> > PCI root
> > complex configuration, PCI brige configuration, igb driver itself, etc.).
> >
> > Big enough that I'd recommend a bisection.  It looks like a bisection 
> > between 3.18
> > and 4.8 would take you about 18 tries to narrow down, assuming all goes 
> > well.
> >
>
> I have now repeated my tests using the vanilla kernel.
> There I got the very same issue.
> Using kernel 4.0 is fine, however starting with kernel 4.1, the issue appears.

Great, thanks for confirming!  That helps narrow things down quite a
bit.

> Here is my exact (reproducible) test description:
> I applied the following patch to the kernel to get the igb trace.
> This patch instruments the igb_rd32() function to measure the call
> to readl() which is used to access registers of the igb NIC.

I took your test setup and ran it between 4.0 and 4.1 on the hardware on
my desk, which is an Atom-based board with dual I210s, however I didn't
see much difference.

However, it's a fairly simple board, with a much simpler PCI topology
than your workstation.  I'll see if I can find some other hardware to
test on.

[..]
> This means, that I think that some other stuff in kernel 4.1 has changed,
> which has impact on the igb accesses.
>
> Any idea what component could cause this kind of issue?

Can you continue your bisection using 'git bisect'?  You've already
narrowed it down between 4.0 and 4.1, so you're well on your way.

Another option might be to try to eliminate igb from the picture as
well, and try reading from another device from the same (or, perhaps
nearest) bus segment, and see if you see the same results.

   Julia


ethtool.h compile warning on c++

2016-10-14 Thread Ben Greear

I am getting warnings about sign mismatch.

Maybe make SPEED_UNKNOWN be ((__u32)(0xFFFFFFFF)) ?

from ethtool.h:

#define SPEED_UNKNOWN   -1

static inline int ethtool_validate_speed(__u32 speed)
{
return speed <= INT_MAX || speed == SPEED_UNKNOWN;
}

Thanks,
Ben

--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com



[PATCH v3] net: Require exact match for TCP socket lookups if dif is l3mdev

2016-10-14 Thread David Ahern
Currently, socket lookups for l3mdev (vrf) use cases can match a socket
that is bound to a port but not a device (ie., a global socket). If the
sysctl tcp_l3mdev_accept is not set this leads to ack packets going out
based on the main table even though the packet came in from an L3 domain.
The end result is that the connection does not establish creating
confusion for users since the service is running and a socket shows in
ss output. Fix by requiring an exact dif to sk_bound_dev_if match if the
skb came through an interface enslaved to an l3mdev device and the
tcp_l3mdev_accept is not set.

skb's through an l3mdev interface are marked by setting a flag in
inet{6}_skb_parm. The IPv6 variant is already set; this patch adds the
flag for IPv4. Using an skb flag avoids a device lookup on the dif. The
flag is set in the VRF driver using the IP{6}CB macros. For IPv4, the
inet_skb_parm struct is moved in the cb per commit 971f10eca186, so the
match function in the TCP stack needs to use TCP_SKB_CB. For IPv6, the
move is done after the socket lookup, so IP6CB is used.

The flags field in inet_skb_parm struct needs to be increased to add
another flag. There is currently a 1-byte hole following the flags,
so it can be expanded to u16 without increasing the size of the struct.

Fixes: 193125dbd8eb ("net: Introduce VRF device driver")
Signed-off-by: David Ahern 
---
v3
- changed the match functions to pull the skb flag from TCP_SKB_CB
  rather than IPCB for IPv4 per changes from 971f10eca186. match
  function is moved to tcp.h as a consequence.
- made flags a u16 versus __u16 for consistency with frag_max_size
- updated commit message

v2
- reordered the checks in inet_exact_dif_match per Eric's comment
- changed the l3mdev determination from looking up the dif to using
  a flag set on the skb which is much faster

 drivers/net/vrf.c   |  2 ++
 include/linux/ipv6.h| 11 +++
 include/net/ip.h|  8 +++-
 include/net/tcp.h   | 11 +++
 net/ipv4/inet_hashtables.c  |  8 +---
 net/ipv6/inet6_hashtables.c |  7 ---
 6 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 85c271c70d42..820de6a9ddde 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -956,6 +956,7 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device 
*vrf_dev,
if (skb->pkt_type == PACKET_LOOPBACK) {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
+   IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
skb->pkt_type = PACKET_HOST;
goto out;
}
@@ -996,6 +997,7 @@ static struct sk_buff *vrf_ip_rcv(struct net_device 
*vrf_dev,
 {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
+   IPCB(skb)->flags |= IPSKB_L3SLAVE;
 
/* loopback traffic; do not push through packet taps again.
 * Reset pkt_type for upper layers to process skb
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 7e9a789be5e0..c0644e5f603a 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -144,6 +144,17 @@ static inline int inet6_iif(const struct sk_buff *skb)
return l3_slave ? skb->skb_iif : IP6CB(skb)->iif;
 }
 
+/* can not be used in TCP layer after tcp_v6_fill_cb */
+static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if defined(CONFIG_NET_L3_MASTER_DEV)
+   if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
+   skb_l3mdev_slave(IP6CB(skb)->flags))
+   return true;
+#endif
+   return false;
+}
+
 struct tcp6_request_sock {
struct tcp_request_sock   tcp6rsk_tcp;
 };
diff --git a/include/net/ip.h b/include/net/ip.h
index bc43c0fcae12..64d05d976b22 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -38,7 +38,7 @@ struct sock;
 struct inet_skb_parm {
int iif;
struct ip_options   opt;/* Compiled IP options  
*/
-   unsigned char   flags;
+   u16 flags;
 
 #define IPSKB_FORWARDEDBIT(0)
 #define IPSKB_XFRM_TUNNEL_SIZE BIT(1)
@@ -48,10 +48,16 @@ struct inet_skb_parm {
 #define IPSKB_DOREDIRECT   BIT(5)
 #define IPSKB_FRAG_PMTUBIT(6)
 #define IPSKB_FRAG_SEGSBIT(7)
+#define IPSKB_L3SLAVE  BIT(8)
 
u16 frag_max_size;
 };
 
+static inline bool skb_l3mdev_slave4(u16 flags)
+{
+   return !!(flags & IPSKB_L3SLAVE);
+}
+
 static inline unsigned int ip_hdrlen(const struct sk_buff *skb)
 {
return ip_hdr(skb)->ihl * 4;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f83b7f220a65..5c7bb59dc331 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -800,6 +800,17 @@ static inline int tcp_v6_iif(const struct sk_buff *skb)
 }
 #endif
 
+/* TCP_SKB_CB reference means this can not be used from early demux */
+static inline bool inet_exact_dif_match(struct 

[patch] stmmac: fix an error code in stmmac_ptp_register()

2016-10-14 Thread Dan Carpenter
PTR_ERR(NULL) is success.  We have to preserve the error code earlier.

Fixes: 7086605a6ab5 ("stmmac: fix error check when init ptp")
Signed-off-by: Dan Carpenter 
---
Applies to net.git tree.

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index 289d527..5d61fb2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -185,8 +185,10 @@ int stmmac_ptp_register(struct stmmac_priv *priv)
priv->ptp_clock = ptp_clock_register(>ptp_clock_ops,
 priv->device);
if (IS_ERR(priv->ptp_clock)) {
+   int ret = PTR_ERR(priv->ptp_clock);
+
priv->ptp_clock = NULL;
-   return PTR_ERR(priv->ptp_clock);
+   return ret;
}
 
spin_lock_init(>ptp_lock);


[PATCH] net: qcom/emac: disable interrupts before calling phy_disconnect

2016-10-14 Thread Timur Tabi
There is a race condition that can occur if EMAC interrupts are
enabled when phy_disconnect() is called.  phy_disconnect() sets
adjust_link to NULL.  When an interrupt occurs, the ISR might
call phy_mac_interrupt(), which wakes up the workqueue function
phy_state_machine().  This function might reference adjust_link,
thereby causing a null pointer exception.

Signed-off-by: Timur Tabi 
---
 drivers/net/ethernet/qualcomm/emac/emac-mac.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c 
b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
index e97968e..6fb3bee 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
@@ -1021,14 +1021,18 @@ void emac_mac_down(struct emac_adapter *adpt)
napi_disable(>rx_q.napi);
 
phy_stop(adpt->phydev);
-   phy_disconnect(adpt->phydev);
 
-   /* disable mac irq */
+   /* Interrupts must be disabled before the PHY is disconnected, to
+* avoid a race condition where adjust_link is null when we get
+* an interrupt.
+*/
writel(DIS_INT, adpt->base + EMAC_INT_STATUS);
writel(0, adpt->base + EMAC_INT_MASK);
synchronize_irq(adpt->irq.irq);
free_irq(adpt->irq.irq, >irq);
 
+   phy_disconnect(adpt->phydev);
+
emac_mac_reset(adpt);
 
emac_tx_q_descs_free(adpt);
-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.



Re: [PATCH net-next v11 1/1] net: phy: Cleanup the Edge-Rate feature in Microsemi PHYs.

2016-10-14 Thread Allan W. Nielsen
Hi David,

I'm really sorry if I messed up, or am not following the protocol...

But you have applied it already:
http://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=4f58e6dceb0e44ca8f21568ed81e1df24e55964c

> commit 4f58e6dceb0e44ca8f21568ed81e1df24e55964c
> Author: Allan W. Nielsen 
> AuthorDate: Wed Oct 12 15:47:51 2016 +0200
> Commit: David S. Miller 
> CommitDate: Fri Oct 14 10:06:13 2016 -0400
> 
> net: phy: Cleanup the Edge-Rate feature in Microsemi PHYs.
> 
> Edge-Rate cleanup include the following:
> - Updated device tree bindings documentation for edge-rate
> - The edge-rate is now specified as a "slowdown", meaning that it is now
>   being specified as positive values instead of negative (both
>   documentation and implementation wise).
> - Only explicitly documented values for "vsc8531,vddmac" and
>   "vsc8531,edge-slowdown" are accepted by the device driver.
> - Deleted include/dt-bindings/net/mscc-phy-vsc8531.h as it was not needed.
> - Read/validate devicetree settings in probe instead of init
> 
> Signed-off-by: Allan W. Nielsen 
> Signed-off-by: Raju Lakkaraju 
> Signed-off-by: David S. Miller 

Maybe the misunderstanding was caused by me posting the re-based version in
another thread.

Anyway, thanks a lot for the big effort you put into maintaining this
sub-system. I will be more careful next time to avoid such confusion.

Best regards
Allan W. Nielsen





On 14/10/16 11:05, David Miller wrote:
> EXTERNAL EMAIL
> 
> 
> From: "Allan W. Nielsen" 
> Date: Thu, 13 Oct 2016 20:21:30 +0200
> 
> > Edge-Rate cleanup include the following:
> > - Updated device tree bindings documentation for edge-rate
> > - The edge-rate is now specified as a "slowdown", meaning that it is now
> >   being specified as positive values instead of negative (both
> >   documentation and implementation wise).
> > - Only explicitly documented values for "vsc8531,vddmac" and
> >   "vsc8531,edge-slowdown" are accepted by the device driver.
> > - Deleted include/dt-bindings/net/mscc-phy-vsc8531.h as it was not needed.
> > - Read/validate devicetree settings in probe instead of init
> >
> > Signed-off-by: Allan W. Nielsen 
> > Signed-off-by: Raju Lakkaraju 
> 
> This patch does not apply to the net-next tree.
> 
> Take my tree, put this email of yours into a file, and run this:
> 
> bash$ git am file
> 
> and you will get:
> 
> [davem@dhcp-10-15-49-210 net-next]$ git am --signoff 
> net-next-v11-1-1-net-phy-Cleanup-the-Edge-Rate-feature-in-Microsemi-PHYs..patch
> Applying: net: phy: Cleanup the Edge-Rate feature in Microsemi PHYs.
> error: patch failed: 
> Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt:6
> error: Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt: patch does 
> not apply
> error: patch failed: drivers/net/phy/mscc.c:12
> error: drivers/net/phy/mscc.c: patch does not apply
> error: include/dt-bindings/net/mscc-phy-vsc8531.h: does not exist in index
> Patch failed at 0001 net: phy: Cleanup the Edge-Rate feature in Microsemi 
> PHYs.
> The copy of the patch that failed is found in:
>/home/davem/src/GIT/net-next/.git/rebase-apply/patch
> When you have resolved this problem, run "git am --continue".
> If you prefer to skip this patch, run "git am --skip" instead.
> To restore the original branch and stop patching, run "git am --abort".
> 
> Please do not resubmit this patch until you can successfully email the
> patch to yourself and apply it cleanly to the net-next tree.
> 
> 


Re: Need help with mdiobus_register and phy

2016-10-14 Thread Timur Tabi

Andrew Lunn wrote:


It is normal to get the phy-mode from device tree. I've no idea what
ACPI is supposed to do. Setting it to PHY_INTERFACE_MODE_NA means you
assume the boot loader has correctly set up the hardware. Your ACPI
firmware might have done this, but there is no guarantee a device tree
based bootloader has. So I would prefer not changing this.


Fair enough.  I don't think it's specified anywhere what firmware is 
supposed to do.


What about specifying PHY_INTERFACE_MODE_NA on ACPI systems, but using 
the phy-mode property on device tree systems?  That doesn't sound like a 
great idea.



>I don't see any other driver issue BMCR_PDOWN in their functions.  I
>added some printks for the PHYSID1 and PHYSID2 registers before and
>after BMCR_PDOWN:
>
>at803x_suspend:235 MII_PHYSID1=004d MII_PHYSID2=d074
>at803x_suspend:242 MII_PHYSID1= MII_PHYSID2=
>
>So after calling BMCR_PDOWN, the PHYSID1 and PHYSID2 registers are
>no longer readable.  Is that expected?

You are making two changes here. Is it the SGMII power down which is
causing the id registers to return 0x, or the BMCR_PDOWN.

The generic suspend code sets the PDOWN bit, so it is assuming the PHY
will respond afterwards.


Ok, it took me a while to figure this out.  The driver does three writes:

phy_write(phydev, AT803X_REG_CHIP_CONFIG, ccr & ~AT803X_BT_BX_REG_SEL);
phy_write(phydev, MII_BMCR, phy_read(phydev, MII_BMCR) | BMCR_PDOWN);
phy_write(phydev, AT803X_REG_CHIP_CONFIG, ccr | AT803X_BT_BX_REG_SEL);

The first clears the AT803X_BT_BX_REG_SEL bit.  According to the 
datasheet, that changes the register set from copper to fiber mode. 
BMCR_PDOWN in fiber mode shuts off the SerDes bus.  That's not true in 
copper mode.


Then after shutting down SerDes, it switches back to copper mode.

I also noticed the at803x_suspend already sends BMCR_PDOWN in copper 
mode earlier in the function.


So the question remains: should drivers shut down the SerDes bus when 
they suspend?  In a sense, I'm wondering if we should revert


at803x: fix suspend/resume for SGMII link

However, the changelog for that patch makes it sound like it's a 
necessary fix.


So I'm torn.  With the SerDes connection disabled, the driver no longer 
responds to ID register reads.  That seems like something that would be 
broken on device tree as well, but I don't understand why no one noticed 
it before.


--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


[PATCH v2 net-next 2/2] ila: Cache a route to translated address

2016-10-14 Thread Tom Herbert
Add a dst_cache to ila_lwt structure. This holds a cached route for the
translated address. In ila_output we now perform a route lookup after
translation and if possible (destination in original route is full 128
bits) we set the dst_cache. Subsequent calls to ila_output can then use
the cache to avoid the route lookup.

This eliminates the need to set the gateway on ILA routes as previously
was being done. Now we can do something like:

./ip route add ::2000:0:0:2/128 encap ila :0:0:2 \
csum-mode neutral-map dev eth0  ## No via needed!

Signed-off-by: Tom Herbert 
---
 net/ipv6/ila/ila_lwt.c | 80 ++
 1 file changed, 74 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index e50c27a..d0a98d9 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -6,29 +6,80 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include "ila.h"
 
+struct ila_lwt {
+   struct ila_params p;
+   struct dst_cache dst_cache;
+   u32 connected : 1;
+};
+
+static inline struct ila_lwt *ila_lwt_lwtunnel(
+   struct lwtunnel_state *lwt)
+{
+   return (struct ila_lwt *)lwt->data;
+}
+
 static inline struct ila_params *ila_params_lwtunnel(
-   struct lwtunnel_state *lwstate)
+   struct lwtunnel_state *lwt)
 {
-   return (struct ila_params *)lwstate->data;
+   return _lwt_lwtunnel(lwt)->p;
 }
 
 static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-   struct dst_entry *dst = skb_dst(skb);
+   struct dst_entry *orig_dst = skb_dst(skb);
+   struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate);
+   struct dst_entry *dst;
+   int err = -EINVAL;
 
if (skb->protocol != htons(ETH_P_IPV6))
goto drop;
 
-   ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), true);
+   ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate),
+   true);
+
+   dst = dst_cache_get(>dst_cache);
+   if (unlikely(!dst)) {
+   struct ipv6hdr *ip6h = ipv6_hdr(skb);
+   struct flowi6 fl6;
+
+   /* Lookup a route for the new destination. Take into
+* account that the base route may already have a gateway.
+*/
+
+   memset(, 0, sizeof(fl6));
+   fl6.flowi6_oif = orig_dst->dev->ifindex;
+   fl6.flowi6_iif = LOOPBACK_IFINDEX;
+   fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst,
+>daddr);
+
+   dst = ip6_route_output(net, NULL, );
+   if (dst->error) {
+   err = -EHOSTUNREACH;
+   dst_release(dst);
+   goto drop;
+   }
+
+   dst = xfrm_lookup(net, dst, flowi6_to_flowi(), NULL, 0);
+   if (IS_ERR(dst)) {
+   err = PTR_ERR(dst);
+   goto drop;
+   }
+
+   if (ilwt->connected)
+   dst_cache_set_ip6(>dst_cache, dst, );
+   }
 
-   return dst->lwtstate->orig_output(net, sk, skb);
+   skb_dst_set(skb, dst);
+   return dst_output(net, sk, skb);
 
 drop:
kfree_skb(skb);
@@ -60,6 +111,7 @@ static int ila_build_state(struct net_device *dev, struct 
nlattr *nla,
   unsigned int family, const void *cfg,
   struct lwtunnel_state **ts)
 {
+   struct ila_lwt *ilwt;
struct ila_params *p;
struct nlattr *tb[ILA_ATTR_MAX + 1];
size_t encap_len = sizeof(*p);
@@ -71,7 +123,7 @@ static int ila_build_state(struct net_device *dev, struct 
nlattr *nla,
if (family != AF_INET6)
return -EINVAL;
 
-   if (cfg6->fc_dst_len < sizeof(struct ila_locator) + 1) {
+   if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) {
/* Need to have full locator and at least type field
 * included in destination
 */
@@ -99,6 +151,13 @@ static int ila_build_state(struct net_device *dev, struct 
nlattr *nla,
if (!newts)
return -ENOMEM;
 
+   ilwt = ila_lwt_lwtunnel(newts);
+   ret = dst_cache_init(>dst_cache, GFP_ATOMIC);
+   if (ret) {
+   kfree(newts);
+   return ret;
+   }
+
newts->len = encap_len;
p = ila_params_lwtunnel(newts);
 
@@ -120,11 +179,19 @@ static int ila_build_state(struct net_device *dev, struct 
nlattr *nla,
newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
LWTUNNEL_STATE_INPUT_REDIRECT;
 
+   if (cfg6->fc_dst_len == 8 * sizeof(struct in6_addr))
+   ilwt->connected = 1;
+
*ts = newts;
 
return 0;
 }
 
+static void ila_destroy_state(struct 

[PATCH v2 net-next 1/2] lwtunnel: Add destroy state operation

2016-10-14 Thread Tom Herbert
Users of lwt tunnels may set up some secondary state in build_state
function. Add a corresponding destroy_state function to allow users to
clean up state. This destroy state function is called from lwtstate_free.
Also, we now free lwstate using kfree_rcu so user can assume structure
is not freed before rcu.

Acked-by: Roopa Prabhu 
Signed-off-by: Tom Herbert 
---
 include/net/lwtunnel.h |  7 +++
 net/core/lwtunnel.c| 13 +
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index ea3f80f..67d235f 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -29,6 +29,7 @@ struct lwtunnel_state {
int (*orig_input)(struct sk_buff *);
int len;
__u16   headroom;
+   struct  rcu_head rcu;
__u8data[0];
 };
 
@@ -36,6 +37,7 @@ struct lwtunnel_encap_ops {
int (*build_state)(struct net_device *dev, struct nlattr *encap,
   unsigned int family, const void *cfg,
   struct lwtunnel_state **ts);
+   void (*destroy_state)(struct lwtunnel_state *lws);
int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
int (*input)(struct sk_buff *skb);
int (*fill_encap)(struct sk_buff *skb,
@@ -46,10 +48,7 @@ struct lwtunnel_encap_ops {
 };
 
 #ifdef CONFIG_LWTUNNEL
-static inline void lwtstate_free(struct lwtunnel_state *lws)
-{
-   kfree(lws);
-}
+void lwtstate_free(struct lwtunnel_state *lws);
 
 static inline struct lwtunnel_state *
 lwtstate_get(struct lwtunnel_state *lws)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index e5f84c2..88fd642 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -130,6 +130,19 @@ int lwtunnel_build_state(struct net_device *dev, u16 
encap_type,
 }
 EXPORT_SYMBOL(lwtunnel_build_state);
 
+void lwtstate_free(struct lwtunnel_state *lws)
+{
+   const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type];
+
+   if (ops->destroy_state) {
+   ops->destroy_state(lws);
+   kfree_rcu(lws, rcu);
+   } else {
+   kfree(lws);
+   }
+}
+EXPORT_SYMBOL(lwtstate_free);
+
 int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
 {
const struct lwtunnel_encap_ops *ops;
-- 
2.9.3



[PATCH v2 net-next 0/2] ila: Cache a route in ILA lwt structure

2016-10-14 Thread Tom Herbert
Add a dst_cache to ila_lwt structure. This holds a cached route for the
translated address. In ila_output we now perform a route lookup after
translation and if possible (destination in original route is full 128
bits) we set the dst_cache. Subsequent calls to ila_output can then use
the cache to avoid the route lookup.

This eliminates the need to set the gateway on ILA routes as previously
was being done. Now we can do something like:

./ip route add ::2000:0:0:2/128 encap ila :0:0:2 \
csum-mode neutral-map dev eth0  ## No via needed!

Also, add destroy_state to lwt ops. We need this to destroy the
dst_cache.

- v2
  - Fixed comparisons to fc_dst_len to make comparison against number
of bits in data structure not bytes.
  - Move destroy_state under build_state (requested by Jiri)
  - Other minor cleanup

Tested:

Running 200 TCP_RR streams:

  Baseline, no ILA

1730716 tps
102/170/313 50/90/99% latencies
88.11 CPU utilization

  Using ILA in both directions

1680428 tps
105/176/325 50/90/99% latencies
88.16 CPU utilization
Tom Herbert (2):
  lwtunnel: Add destroy state operation
  ila: Cache a route to translated address

 include/net/lwtunnel.h |  7 ++---
 net/core/lwtunnel.c| 13 
 net/ipv6/ila/ila_lwt.c | 80 ++
 3 files changed, 90 insertions(+), 10 deletions(-)

-- 
2.9.3



Re: Need help with mdiobus_register and phy

2016-10-14 Thread Andrew Lunn
On Fri, Oct 14, 2016 at 11:57:29AM -0500, Timur Tabi wrote:
> Andrew Lunn wrote:
> >That is a basic assumption of the code. If you cannot read the IDs how
> >are you supposed to know what device it is, and what quirks you need
> >to work around its broken features...
> >
> >Does the datasheet say anything about this?
> >
> >I would say for this device, suspend() is too aggressive.
> 
> This change in my driver makes the problem go away (I'm not sure if
> it's a "fix"):
> 
> @@ -992,7 +992,7 @@ int emac_mac_up(struct emac_adapter *adpt)
> emac_mac_rx_descs_refill(adpt, >rx_q);
> 
> ret = phy_connect_direct(netdev, adpt->phydev, emac_adjust_link,
> -PHY_INTERFACE_MODE_SGMII);
> +PHY_INTERFACE_MODE_NA);

It is normal to get the phy-mode from device tree. I've no idea what
ACPI is supposed to do. Setting it to PHY_INTERFACE_MODE_NA means you
assume the boot loader has correctly set up the hardware. Your ACPI
firmware might have done this, but there is no guarantee a device tree
based bootloader has. So I would prefer not changing this.
 
> With the interface not set as SGMII, the following code in
> at803x_suspend() is not executed:
> 
> /* also power-down SGMII interface */
> ccr = phy_read(phydev, AT803X_REG_CHIP_CONFIG);
> phy_write(phydev, AT803X_REG_CHIP_CONFIG, ccr & ~AT803X_BT_BX_REG_SEL);
> phy_write(phydev, MII_BMCR, phy_read(phydev, MII_BMCR) | BMCR_PDOWN);
> phy_write(phydev, AT803X_REG_CHIP_CONFIG, ccr | AT803X_BT_BX_REG_SEL);
> 
> I don't see any other driver issue BMCR_PDOWN in their functions.  I
> added some printks for the PHYSID1 and PHYSID2 registers before and
> after BMCR_PDOWN:
> 
> at803x_suspend:235 MII_PHYSID1=004d MII_PHYSID2=d074
> at803x_suspend:242 MII_PHYSID1= MII_PHYSID2=
> 
> So after calling BMCR_PDOWN, the PHYSID1 and PHYSID2 registers are
> no longer readable.  Is that expected?

You are making two changes here. Is it the SGMII power down which is
causing the id registers to return 0x, or the BMCR_PDOWN.

The generic suspend code sets the PDOWN bit, so it is assuming the PHY
will respond afterwards.

 Andrew


Re: Need help with mdiobus_register and phy

2016-10-14 Thread Timur Tabi

Andrew Lunn wrote:

That is a basic assumption of the code. If you cannot read the IDs how
are you supposed to know what device it is, and what quirks you need
to work around its broken features...

Does the datasheet say anything about this?

I would say for this device, suspend() is too aggressive.


This change in my driver makes the problem go away (I'm not sure if it's 
a "fix"):


@@ -992,7 +992,7 @@ int emac_mac_up(struct emac_adapter *adpt)
emac_mac_rx_descs_refill(adpt, >rx_q);

ret = phy_connect_direct(netdev, adpt->phydev, emac_adjust_link,
-PHY_INTERFACE_MODE_SGMII);
+PHY_INTERFACE_MODE_NA);

With the interface not set as SGMII, the following code in 
at803x_suspend() is not executed:


/* also power-down SGMII interface */
ccr = phy_read(phydev, AT803X_REG_CHIP_CONFIG);
phy_write(phydev, AT803X_REG_CHIP_CONFIG, ccr & ~AT803X_BT_BX_REG_SEL);
phy_write(phydev, MII_BMCR, phy_read(phydev, MII_BMCR) | BMCR_PDOWN);
phy_write(phydev, AT803X_REG_CHIP_CONFIG, ccr | AT803X_BT_BX_REG_SEL);

I don't see any other driver issue BMCR_PDOWN in their functions.  I 
added some printks for the PHYSID1 and PHYSID2 registers before and 
after BMCR_PDOWN:


at803x_suspend:235 MII_PHYSID1=004d MII_PHYSID2=d074
at803x_suspend:242 MII_PHYSID1= MII_PHYSID2=

So after calling BMCR_PDOWN, the PHYSID1 and PHYSID2 registers are no 
longer readable.  Is that expected?


--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


RE: bug in ixgbe_atr

2016-10-14 Thread Duyck, Alexander H


> -Original Message-
> From: Sowmini Varadhan [mailto:sowmini.varad...@oracle.com]
> Sent: Thursday, October 13, 2016 8:49 PM
> To: Duyck, Alexander H 
> Cc: netdev@vger.kernel.org
> Subject: Re: bug in ixgbe_atr
> 
> On (10/14/16 02:06), Duyck, Alexander H wrote:
> > > + case ETH_P_IP:
> > > + skb_header_pointer(skb, ETH_HLEN, sizeof (struct iphdr),
> > > +_hdr);
> > >   /* access ihl as u8 to avoid unaligned access on ia64 */
> > > - hlen = (hdr.network[0] & 0x0F) << 2;
> > > - l4_proto = hdr.ipv4->protocol;
> > > + hlen = ip_hdr.ipv4.ihl << 2;
> > > + l4_proto = ip_hdr.ipv4.protocol;
> > >   break;
>   :
> > The problem is this will break other stuff, for example I have seen
> > the ihl access actually cause problems with unaligned accesses as some
> > architectures decide to pull it as a u32 and then mask it.
> 
> Yes, I noticed that u8 comment for ia64.. if that's the only issue here, we 
> could
> just reset hdr.network to _hdr..
> 
> However, I suspect the above patch is probably not going to work for the vlan
> case (it was just a first-pass hack)

I kind of figured that.  Ideally we only want to pick out the pieces we need.  I 
would prefer to avoid skb_header_pointer if possible since we only need a few 
parts of the header and don't really need to copy the whole thing.

> > My advice would be to keep this simple.  Add a check to make sure we
> > have room for at least skb_headlen(skb) - 40  >= hdr.raw - skb->data.
> 
> I don't parse that- the hdr union in ixgbe_atr doesnt have a ->raw field. Can 
> you
> explain?

Sorry I was thinking of a different piece of code.  In the case of the atr code 
it would be hdr.network, not hdr.raw.  Basically the thought was to validate 
that there is enough data in skb_headlen() that we can verify that from where 
the network header should be we have at least 40 bytes of data as that would be 
the minimum needed for a TCP header and an IPv4 header, or just an IPv6 header. 
 We would probably need a separate follow-up for the TCP header after we 
validate network header.

> > Messing with the protocol bits will break stuff since there is support
> > for tunneling also floating around in here now.
> >
> > I believe we are planning on dropping this code in favor of
> > ndo_rx_flow_steer in the future.  If we do that then the whole problem
> > becomes moot.
> 
> Dropping it is fine with me I guess - maybe just return, if the
> skb_headlen() doesnt have enough bytes for a network header, i.e., skb_headlen
> is at least ETH_HLEN + sizeof (struct iphdr) for ETH_P_IP, or  ETH_HLEN + 
> sizeof
> (struct ipv6hdr) for ETH_P_IPV6?
> 
> --Sowmini

Right that is kind of what I was thinking.  If we validate that we have at 
least 40 before inspecting the network header, and at least 20 before we 
validate the TCP header that would work for me.

- Alex


Re: [PATCH net-next v2 0/6] FUJITSU Extended Socket driver version 1.2

2016-10-14 Thread David Miller
From: Taku Izumi 
Date: Fri, 14 Oct 2016 20:25:44 +0900

> This patchset updates FUJITSU Extended Socket network driver into version 1.2.
> This includes the following enhancements:
>   - ethtool -d support
>   - ethtool -S enhancement
>   - ethtool -w/-W support
>   - Add some debugging feature (tracepoints etc)
> 
> v1 -> v2:
>   - Use u64 instead of phys_addr_t as TP_STRUCT__entry
>   - Use ethtool facility to achieve debug mode instead of using debugfs

Series applied, thanks.


Re: [PATCH v3 net-next 0/7] qed*: driver updates

2016-10-14 Thread David Miller
From: Manish Chopra 
Date: Fri, 14 Oct 2016 05:19:16 -0400

> There are several new additions in this series;
> Most are connected to either Tx offloading or Rx classifications
> [either fastpath changes or supporting configuration].
> 
> In addition, there's a single IOV enhancement.
> 
> Please consider applying this series to `net-next'.
> 
> V2->V3:
> Fixes below kbuild warning
> call to '__compiletime_assert_60' declared with
> attribute error: Need native word sized stores/loads for atomicity.
> 
> V1->V2:
> Added a fix for the race in ramrod handling
> pointed by Eric Dumazet [patch 7].

Series applied, thanks.


Re: [PATCH v2] net: Require exact match for TCP socket lookups if dif is l3mdev

2016-10-14 Thread David Ahern
On 10/14/16 6:21 AM, David Ahern wrote:
>> So you might need to let the caller pass IP6CB(skb)->flags (or
>> TCP_SKB_CB(skb)->header.h6.flags ) instead of skb since
>> inet6_exact_dif_match() does not know where to fetch the flags.
>>
>> Same issue for IPv4.
> 
> I'll update the match functions to pull from TCP_SKB_CB instead of IP6CB and 
> make a note of the above.

IPv6 does the move after the socket lookup where IPv4 does it before.



Re: [PATCH net-next 1/2] lwtunnel: Add destroy state operation

2016-10-14 Thread David Miller
From: Tom Herbert 
Date: Thu, 13 Oct 2016 17:57:42 -0700

> @@ -130,6 +130,19 @@ int lwtunnel_build_state(struct net_device *dev, u16 
> encap_type,
>  }
>  EXPORT_SYMBOL(lwtunnel_build_state);
>  
> +void  lwtstate_free(struct lwtunnel_state *lws)

There should only be one space between "void" and "lwtstate_free".


Re: [PATCH trivial] net: add bbr to config DEFAULT_TCP_CONG

2016-10-14 Thread David Miller
From: Markus Trippelsdorf 
Date: Fri, 14 Oct 2016 10:07:16 +0200

> On 2016.10.14 at 09:43 +0200, Eric Dumazet wrote:
>> On Fri, 2016-10-14 at 09:33 +0200, Markus Trippelsdorf wrote:
>> > While playing with BBR I noticed that it was missing in the list of
>> > possible config DEFAULT_TCP_CONG choices. Fixed thusly.
>> > 
>> > Signed-off-by: Markus Trippelsdorf 
>> > 
>> > diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
>> > index 300b06888fdf..b54b3ca939db 100644
>> > --- a/net/ipv4/Kconfig
>> > +++ b/net/ipv4/Kconfig
>> > @@ -715,6 +715,7 @@ config DEFAULT_TCP_CONG
>> >default "reno" if DEFAULT_RENO
>> >default "dctcp" if DEFAULT_DCTCP
>> >default "cdg" if DEFAULT_CDG
>> > +  default "bbr" if DEFAULT_BBR
>> >default "cubic"
>> 
>> Not sure if we want this at this moment.
>> 
>> BBR needs FQ packet scheduler, and this is not exactly trivial to
>> achieve.
> 
> For a start, it could be automatically selected:

Right but FQ has to be properly enabled and configured as well.


Re: [PATCH net 0/3] qed: Fix dependencies and warnings series

2016-10-14 Thread David Miller
From: Yuval Mintz 
Date: Thu, 13 Oct 2016 22:57:00 +0300

> The first patch in this series follows Dan Carpenter's reports about
> Smatch warnings for recent qed additions and fixes those.
> 
> The second patch is the most significant one [and the reason this is
> intended for 'net'] - it's based on Arnd Bergmann's suggestion for fixing
> compilation issues that were introduced with the roce addition as a result
> of certain combinations of qed, qede and qedr Kconfig options.
> 
> The third follows the discussion with Arnd and clears a lot of the warnings
> that arise when compiling the drivers with "C=1".
> 
> Please consider applying this series to 'net'.

Series applied, thanks.


Re: [PATCH net] net/mlx4_en: fixup xdp tx irq to match rx

2016-10-14 Thread David Miller
From: Brenden Blanco 
Date: Thu, 13 Oct 2016 13:13:11 -0700

> In cases where the number of tx rings is not a multiple of the number of
> rx rings, the tx completion event will be handled on a different core
> from the transmit and population of the ring. Races on the ring will
> lead to a double-free of the page, and possibly other corruption.
> 
> The rings are initialized by default with a valid multiple of rings,
> based on the number of cpus, therefore an invalid configuration requires
> ethtool to change the ring layout. For instance 'ethtool -L eth0 rx 9 tx
> 8' will cause packets received on rx0, and XDP_TX'd to tx48, to be
> completed on cpu3 (48 % 9 == 3).
> 
> Resolve this discrepancy by shifting the irq for the xdp tx queues to
> start again from 0, modulo rx_ring_num.
> 
> Fixes: 9ecc2d86171a ("net/mlx4_en: add xdp forwarding and data write support")
> Reported-by: Jesper Dangaard Brouer 
> Signed-off-by: Brenden Blanco 

Applied and queued up for -stable, thanks.


Re: [PATCH net-next 0/5] udp: Flow dissection for tunnels

2016-10-14 Thread David Miller
From: Tom Herbert 
Date: Thu, 13 Oct 2016 12:29:46 -0700

> On Thu, Oct 13, 2016 at 12:17 PM, David Miller  wrote:
>> This socket lookup is very heavy handed, and I realize that you
>> need this because we no longer store the encapsulation socket in
>> skb->sk these days.
>>
> I don't quite understand your point about the encapsulation socket.

On the transmit side we used to have an issue wrt. what socket lives
on skb->sk when encapsulation is involved.

The problem is that we need skb->sk to be the transport level socket,
but in the output path we used to make tests on "skb->sk" to determine
things such as the multicast loopback flag.  But we should be looking
at the tunnel socket for that, otherwise we could do crazy things
like dereference an AF_PACKET socket as if it were an inet socket one.

As such we modified the output path to pass the inner tunnel socket
'sk' down through the call chain, as an argument to functions such as
ip_queue_xmit(), ip_local_out*(), etc.


Re: [PATCH net-next v11 1/1] net: phy: Cleanup the Edge-Rate feature in Microsemi PHYs.

2016-10-14 Thread David Miller
From: "Allan W. Nielsen" 
Date: Thu, 13 Oct 2016 20:21:30 +0200

> Edge-Rate cleanup include the following:
> - Updated device tree bindings documentation for edge-rate
> - The edge-rate is now specified as a "slowdown", meaning that it is now
>   being specified as positive values instead of negative (both
>   documentation and implementation wise).
> - Only explicitly documented values for "vsc8531,vddmac" and
>   "vsc8531,edge-slowdown" are accepted by the device driver.
> - Deleted include/dt-bindings/net/mscc-phy-vsc8531.h as it was not needed.
> - Read/validate devicetree settings in probe instead of init
> 
> Signed-off-by: Allan W. Nielsen 
> Signed-off-by: Raju Lakkaraju 

This patch does not apply to the net-next tree.

Take my tree, put this email of yours into a file, and run this:

bash$ git am file

and you will get:

[davem@dhcp-10-15-49-210 net-next]$ git am --signoff 
net-next-v11-1-1-net-phy-Cleanup-the-Edge-Rate-feature-in-Microsemi-PHYs..patch 
Applying: net: phy: Cleanup the Edge-Rate feature in Microsemi PHYs.
error: patch failed: 
Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt:6
error: Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt: patch does 
not apply
error: patch failed: drivers/net/phy/mscc.c:12
error: drivers/net/phy/mscc.c: patch does not apply
error: include/dt-bindings/net/mscc-phy-vsc8531.h: does not exist in index
Patch failed at 0001 net: phy: Cleanup the Edge-Rate feature in Microsemi PHYs.
The copy of the patch that failed is found in:
   /home/davem/src/GIT/net-next/.git/rebase-apply/patch
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".

Please do not resubmit this patch until you can successfully email the
patch to yourself and apply it cleanly to the net-next tree.




Re: [PATCH 2/2] IPv6: fix DESYNC_FACTOR

2016-10-14 Thread David Miller
From: Jiri Bohac 
Date: Thu, 13 Oct 2016 18:52:15 +0200

> The IPv6 temporary address generation uses a variable called DESYNC_FACTOR
> to prevent hosts updating the addresses at the same time. Quoting RFC 4941:
> 
>... The value DESYNC_FACTOR is a random value (different for each
>client) that ensures that clients don't synchronize with each other and
>generate new addresses at exactly the same time ...
> 
> DESYNC_FACTOR is defined as:
> 
>DESYNC_FACTOR -- A random value within the range 0 - MAX_DESYNC_FACTOR.
>It is computed once at system start (rather than each time it is used)
>and must never be greater than (TEMP_VALID_LIFETIME - REGEN_ADVANCE).
> 
> First, I believe the RFC has a typo in it and meant to say: "and must
> never be greater than (TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE)"
> 
> The reason is that at various places in the RFC, DESYNC_FACTOR is used in
> a calculation like (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR) or
> (TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE - DESYNC_FACTOR). It needs to be
> smaller than (TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE) for the result of
> these calculations to be larger than zero. It's never used in a
> calculation together with TEMP_VALID_LIFETIME.
> 
> I already submitted an errata to the rfc-editor:
> https://www.rfc-editor.org/errata_search.php?rfc=4941
> 
> The Linux implementation of DESYNC_FACTOR is very wrong:
> max_desync_factor is used in places DESYNC_FACTOR should be used.
> max_desync_factor is initialized to the RFC-recommended value for
> MAX_DESYNC_FACTOR (600) but the whole point is to get a _random_ value.
> 
> And nothing ensures that the value used is not greater than
> (TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE), which leads to underflows.  The
> effect can easily be observed when setting the temp_prefered_lft sysctl
> e.g. to 60. The preferred lifetime of the temporary addresses will be
> bogus.
> 
> TEMP_PREFERRED_LIFETIME and REGEN_ADVANCE are not constants and can be
> influenced by these three sysctls: regen_max_retry, dad_transmits and
> temp_prefered_lft. Thus, the upper bound for desync_factor needs to be
> re-calculated each time a new address is generated and if desync_factor is
> larger than the new upper bound, a new random value needs to be
> re-generated.
> 
> And since we already have max_desync_factor configurable per interface, we
> also need to calculate and store desync_factor per interface.
> 
> Signed-off-by: Jiri Bohac 

Applied.


Re: [PATCH 1/2] IPv6: Drop the temporary address regen_timer

2016-10-14 Thread David Miller
From: Jiri Bohac 
Date: Thu, 13 Oct 2016 18:50:02 +0200

> The randomized interface identifier (rndid) was periodically updated from
> the regen_timer timer. Simplify the code by updating the rndid only when
> needed by ipv6_try_regen_rndid().
> 
> This makes the follow-up DESYNC_FACTOR fix much simpler.  Also it fixes a
> reference counting error in this error path, where an in6_dev_put was
> missing:
>   err = addrconf_sysctl_register(ndev);
>   if (err) {
>   ipv6_mc_destroy_dev(ndev);
>   -   del_timer(&ndev->regen_timer);
>   snmp6_unregister_dev(ndev);
>   goto err_release;
> 
> Signed-off-by: Jiri Bohac 

Applied.


Re: [PATCH v3] IB/ipoib: move back IB LL address into the hard header

2016-10-14 Thread David Miller
From: Paolo Abeni 
Date: Thu, 13 Oct 2016 18:26:56 +0200

> After the commit 9207f9d45b0a ("net: preserve IP control block
> during GSO segmentation"), the GSO CB and the IPoIB CB conflict.
> That destroys the IPoIB address information cached there,
> causing a severe performance regression, as better described here:
> 
> http://marc.info/?l=linux-kernel&m=146787279825501&w=2
> 
> This change moves the data cached by the IPoIB driver from the
> skb control block into the IPoIB hard header, as done before
> the commit 936d7de3d736 ("IPoIB: Stop lying about hard_header_len
> and use skb->cb to stash LL addresses").
> In order to avoid GRO issue, on packet reception, the IPoIB driver
> stash into the skb a dummy pseudo header, so that the received
> packets have actually a hard header matching the declared length.
> To avoid changing the connected mode maximum mtu, the allocated
> head buffer size is increased by the pseudo header length.
> 
> After this commit, IPoIB performances are back to pre-regression
> value.
> 
> v2 -> v3: rebased
> v1 -> v2: avoid changing the max mtu, increasing the head buf size
> 
> Fixes: 9207f9d45b0a ("net: preserve IP control block during GSO segmentation")
> Signed-off-by: Paolo Abeni 

Applied and queued up for -stable, thanks.


Re: [PATCH v3] IB/ipoib: move back IB LL address into the hard header

2016-10-14 Thread Paolo Abeni
On Fri, 2016-10-14 at 13:23 +0300, Or Gerlitz wrote:
> 
> Paolo,
> 
> Is this fix backportable to any kernel since the breakage? 

yes, AFAIK this is back-portable.

> AFAIR,
> Roland mentioned
> that a 2nd change introduced in 4.7-rc1 changed things a bit more such
> that the fix
> he had in his head didn't apply any more.

If you refer to shrinking both IPoIB and gso control buffer, as proposed
by Roland in:

http://marc.info/?l=linux-kernel&m=146787279825501&w=2

that will not work, since the gso control buffer is grown a bit since
the first time the bug was detected.

This patch does not have that sort of issue.

> I am still travelling after netdev and would like to put an eye on the
> patch and also see that @mellanox.com someone

Not sure if that helps, but a 3rd party has already confirmed privately
that this patch fixes the bug for them.

Paolo




Re: [PATCH net-next 0/4] rxrpc: Fixes

2016-10-14 Thread David Miller
From: David Howells 
Date: Thu, 13 Oct 2016 17:12:09 +0100

> 
> This set of patches contains a bunch of fixes:
> 
>  (1) Fix use of kunmap() after change from kunmap_atomic() within AFS.
> 
>  (2) Don't use ERR_PTR() with an always zero value.
> 
>  (3) Check the right error when using ip6_route_output().
> 
>  (4) Be consistent about whether call->operation_ID is BE or CPU-E within
>  AFS.
> 
> The patches can be found here also:
> 
>   
> http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=rxrpc-rewrite
> 
> Tagged thusly:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
>   rxrpc-rewrite-20161013

Pulled, thanks David.


Re: [PATCH net-next] Documentation/networking: update git urls to use https over http

2016-10-14 Thread David Miller
From: Alexander Alemayhu 
Date: Thu, 13 Oct 2016 17:09:51 +0200

> This fixes the following errors when trying to clone the urls:
> 
> Cloning into 'net'...
> fatal: repository 
> 'http://git.kernel.org/cgit/linux/kernel/git/davem/net.git/' not found
> Cloning into 'net-next'...
> fatal: repository 
> 'http://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/' not found
> Cloning into 'linux'...
> fatal: repository 
> 'http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/' not found
> Cloning into 'stable-queue'...
> fatal: repository 
> 'http://git.kernel.org/cgit/linux/kernel/git/stable/stable-queue.git/' not 
> found
> 
> Signed-off-by: Alexander Alemayhu 

Since this is simply a documentation fix I applied this to 'net', thanks.


Re: [PATCH v4 net-next 0/4] act_mirred: Ingress actions support

2016-10-14 Thread David Miller
From: Shmulik Ladkani 
Date: Thu, 13 Oct 2016 09:06:40 +0300

> This patch series implements action mirred 'ingress' actions
> TCA_INGRESS_REDIR and TCA_INGRESS_MIRROR.
> 
> This allows attaching filters whose target is to hand matching skbs into
> the rx processing of a specified device.
> 
> v4:
>   in 4/4, check ret code of netif_receive_skb, as suggested by Cong Wang
> v3:
>   in 4/4, addressed non coherency due to reading m->tcfm_eaction multiple
>   times, as spotted by Eric Dumazet
> v2:
>   in 1/4, declare tcfm_mac_header_xmit as bool instead of int

Series applied, thanks.


Re: [PATCH] mac80211: aes_ccm: move struct aead_req off the stack

2016-10-14 Thread Ard Biesheuvel

> On 14 Oct 2016, at 14:46, Johannes Berg  wrote:
> 
> 
>> 
>> Is the aad[] actually reused? I would assume it only affects the mac
>> on encryption, and the verification on decryption but I don't think
>> we actually need it back from the crypto routines.
> 
> I don't think it's reused.
> 
>> Exactly what you said above :-) My patch only touches CCM but as you
>> said,
>> 
>> """
>> 'Also there's B_0/J_0 for CCM/GCM, and the 'zero' thing that GMAC
>> has.
>> """
> 
> Ah, but we can/should do the same for the others, no?
> 

Yes, but then we end up kmalloc/kfreeing chunks of 16 bytes, which is actually 
another problem.

I still think we are not violating the api by putting aead_req on the stack 
(but herbert should confirm). The aad[] issue does violate the api, so it 
deserves a separate fix imo

Re: [PATCH net-next 00/11] net: Fix netdev adjacency tracking

2016-10-14 Thread David Miller
From: David Ahern 
Date: Wed, 12 Oct 2016 13:51:48 -0700

> The netdev adjacency tracking is failing to create proper dependencies
> for some topologies. For example this topology
> 
>     +--------+
>     |  myvrf |
>     +--------+
>       |    |
>       |  +---------+
>       |  | macvlan |
>       |  +---------+
>       |    |
>   +----------+
>   |  bridge  |
>   +----------+
>       |
>   +--------+
>   | bond0  |
>   +--------+
>       |
>   +--------+
>   |  eth3  |
>   +--------+
> 
> hits 1 of 2 problems depending on the order of enslavement. The base set of
> commands for both cases:
> 
> ip link add bond1 type bond
> ip link set bond1 up

Your diagram uses the device name "bond0" but all of your command examples
use "bond1".  Please fix this up, thanks.


Re: [PATCH 2/2] net: wan: slic_ds26522: Export OF module alias information

2016-10-14 Thread David Miller
From: Javier Martinez Canillas 
Date: Wed, 12 Oct 2016 15:55:41 -0300

> When the device is registered via OF, the OF table is used to match the
> driver instead of the SPI device ID table, but the entries in the latter
> are used as aliases to load the module if the driver was not built-in.
> 
> This is because the SPI core always reports an SPI module alias instead
> of an OF one, but that could change so it's better to always export it.
> 
> Signed-off-by: Javier Martinez Canillas 

Applied.


Re: [PATCH 1/2] net: wan: slic_ds26522: add SPI device ID table to fix module autoload

2016-10-14 Thread David Miller
From: Javier Martinez Canillas 
Date: Wed, 12 Oct 2016 15:55:40 -0300

> If the driver is built as a module, module alias information isn't filled
> so the module won't be autoloaded. Add a SPI device ID table and use the
> MODULE_DEVICE_TABLE() macro so the information is exported in the module.
> 
> Before this patch:
> 
> $ modinfo drivers/net/wan/slic_ds26522.ko | grep alias
> $
> 
> After this patch:
> 
> $ modinfo drivers/net/wan/slic_ds26522.ko | grep alias
> alias:  spi:ds26522
> 
> Signed-off-by: Javier Martinez Canillas 

Applied.


Re: [PATCH] net: wan: slic_ds26522: Allow driver to built if COMPILE_TEST is enabled

2016-10-14 Thread David Miller
From: Javier Martinez Canillas 
Date: Wed, 12 Oct 2016 16:05:59 -0300

> The driver only has runtime but no build time dependency with FSL_SOC ||
> ARCH_MXC || ARCH_LAYERSCAPE.  So it can be built for testing purposes if
> the COMPILE_TEST option is enabled.
> 
> This is useful to have more build coverage and make sure that the driver
> is not affected by changes that could cause build regressions.
> 
> Signed-off-by: Javier Martinez Canillas 

Applied.


Re: [PATCH net] ipv6: correctly add local routes when lo goes up

2016-10-14 Thread David Miller
From: Nicolas Dichtel 
Date: Wed, 12 Oct 2016 10:10:40 +0200

> The goal of the patch is to fix this scenario:
>  ip link add dummy1 type dummy
>  ip link set dummy1 up
>  ip link set lo down ; ip link set lo up
> 
> After that sequence, the local route to the link layer address of dummy1 is
> not there anymore.
> 
> When the loopback is set down, all local routes are deleted by
> addrconf_ifdown()/rt6_ifdown(). At this time, the rt6_info entry still
> exists, because the corresponding idev has a reference on it. After the rcu
> grace period, dst_rcu_free() is called, and thus ___dst_free(), which will
> set obsolete to DST_OBSOLETE_DEAD.
> 
> In this case, init_loopback() is called before dst_rcu_free(), thus
> obsolete is still set to something <= 0. So, the function doesn't add the
> route again. To avoid that race, let's check the rt6 refcnt instead.
> 
> Fixes: 25fb6ca4ed9c ("net IPv6 : Fix broken IPv6 routing table after loopback 
> down-up")
> Fixes: a881ae1f625c ("ipv6: don't call addrconf_dst_alloc again when enable 
> lo")
> Fixes: 33d99113b110 ("ipv6: reallocate addrconf router for ipv6 address when 
> lo device up")
> Reported-by: Francesco Santoro 
> Reported-by: Samuel Gauthier 
 ...
> Signed-off-by: Nicolas Dichtel 

Applied and queued up for -stable, thanks.


Re: [PATCH] net: limit a number of namespaces which can be cleaned up concurrently

2016-10-14 Thread David Miller
From: ebied...@xmission.com (Eric W. Biederman)
Date: Thu, 13 Oct 2016 22:06:28 -0500

> Oh that is a surprise.  We can definitely skip generating uevents for
> network namespaces that are exiting because by definition no one can see
> those network namespaces.  If a socket existed that could see those
> uevents it would hold a reference to the network namespace and as such
> the network namespace could not exit.
> 
> That sounds like it is worth investigating a little more deeply.
> 
> I am surprised that allocation and freeing is so heavy we are spending
> lots of time doing that.  On the other hand kobj_bcast_filter is very
> dumb and very late so I expect something can be moved earlier and make
> that code cheaper with the tiniest bit of work.

I definitely would rather see the uevents removed to kill ~99% of the
namespace removal overhead rather than limiting.


Re: [PATCH net] Revert "net: Add driver helper functions to determine checksum offloadability"

2016-10-14 Thread David Miller
From: Stephen Hemminger 
Date: Tue, 11 Oct 2016 13:04:09 -0700

> 
> This reverts commit 6ae23ad36253a8033c5714c52b691b84456487c5.
> 
> The code has been in the kernel since 4.4 but there is no in-tree
> code that uses it. Unused code is broken code, remove it.
> 
> Signed-off-by: Stephen Hemminger 

Applied to net-next, thanks Stephen.

Please put proper "[PATCH net-next]" tags in your Subject lines when
patches are targeting net-next.

Thanks.


Re: [PATCH net] ip6_tunnel: fix ip6_tnl_lookup

2016-10-14 Thread David Miller
From: Vadim Fedorenko 
Date: Tue, 11 Oct 2016 22:47:20 +0300

> The commit ea3dc9601bda ("ip6_tunnel: Add support for wildcard tunnel
> endpoints.") introduces support for wildcards in tunnels endpoints,
> but in some rare circumstances ip6_tnl_lookup selects wrong tunnel
> interface relying only on source or destination address of the packet
> and not checking presence of wildcard in tunnels endpoints. Later in
> ip6_tnl_rcv these packets can be discarded because of difference in
> ipproto even if fallback device has proper ipproto configuration.
> 
> This patch adds checks of wildcard endpoint in tunnel avoiding such
> behavior
> 
> Fixes: ea3dc9601bda ("ip6_tunnel: Add support for wildcard tunnel
> endpoints.")
> 
> Signed-off-by: Vadim Fedorenko 

Applied and queued up for -stable.


Re: [PATCH v2] r8169: set coherent DMA mask as well as streaming DMA mask

2016-10-14 Thread Ard Biesheuvel

> On 14 Oct 2016, at 14:42, David Laight  wrote:
> 
> From: Of Ard Biesheuvel
>> Sent: 14 October 2016 14:41
>> PCI devices that are 64-bit DMA capable should set the coherent
>> DMA mask as well as the streaming DMA mask. On some architectures,
>> these are managed separately, and so the coherent DMA mask will be
>> left at its default value of 32 if it is not set explicitly. This
>> results in errors such as
>> 
>> r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>> hwdev DMA mask = 0x00000000ffffffff, dev_addr = 0x00000080fbfff000
>> swiotlb: coherent allocation failed for device 0000:02:00.0 size=4096
>> CPU: 0 PID: 1062 Comm: systemd-udevd Not tainted 4.8.0+ #35
>> Hardware name: AMD Seattle/Seattle, BIOS 10:53:24 Oct 13 2016
>> 
>> on systems without memory that is 32-bit addressable by PCI devices.
>> 
>> Signed-off-by: Ard Biesheuvel 
>> ---
>> v2: dropped the hunk that sets the coherent DMA mask to DMA_BIT_MASK(32),
>>which is unnecessary given that it is the default
>> 
>> drivers/net/ethernet/realtek/r8169.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>> 
>> diff --git a/drivers/net/ethernet/realtek/r8169.c 
>> b/drivers/net/ethernet/realtek/r8169.c
>> index e55638c7505a..bf000d819a21 100644
>> --- a/drivers/net/ethernet/realtek/r8169.c
>> +++ b/drivers/net/ethernet/realtek/r8169.c
>> @@ -8273,7 +8273,8 @@ static int rtl_init_one(struct pci_dev *pdev, const 
>> struct pci_device_id *ent)
>>if ((sizeof(dma_addr_t) > 4) &&
>>(use_dac == 1 || (use_dac == -1 && pci_is_pcie(pdev) &&
>>  tp->mac_version >= RTL_GIGA_MAC_VER_18)) &&
>> -!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
>> +!pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) &&
>> +!pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
> 
> Isn't there a dma_set_mask_and_coherent() function ?
> 

Not of the pci_xxx variety afaik

Re: [PATCH v2] r8169: set coherent DMA mask as well as streaming DMA mask

2016-10-14 Thread Ard Biesheuvel


> On 14 Oct 2016, at 14:42, David Laight wrote:
> 
> From: Of Ard Biesheuvel
>> Sent: 14 October 2016 14:41
>> PCI devices that are 64-bit DMA capable should set the coherent
>> DMA mask as well as the streaming DMA mask. On some architectures,
>> these are managed separately, and so the coherent DMA mask will be
>> left at its default value of 32 if it is not set explicitly. This
>> results in errors such as
>> 
>> r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>> hwdev DMA mask = 0x00000000ffffffff, dev_addr = 0x00000080fbfff000
>> swiotlb: coherent allocation failed for device 0000:02:00.0 size=4096
>> CPU: 0 PID: 1062 Comm: systemd-udevd Not tainted 4.8.0+ #35
>> Hardware name: AMD Seattle/Seattle, BIOS 10:53:24 Oct 13 2016
>> 
>> on systems without memory that is 32-bit addressable by PCI devices.
>> 
>> Signed-off-by: Ard Biesheuvel 
>> ---
>> v2: dropped the hunk that sets the coherent DMA mask to DMA_BIT_MASK(32),
>>which is unnecessary given that it is the default
>> 
>> drivers/net/ethernet/realtek/r8169.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>> 
>> diff --git a/drivers/net/ethernet/realtek/r8169.c 
>> b/drivers/net/ethernet/realtek/r8169.c
>> index e55638c7505a..bf000d819a21 100644
>> --- a/drivers/net/ethernet/realtek/r8169.c
>> +++ b/drivers/net/ethernet/realtek/r8169.c
>> @@ -8273,7 +8273,8 @@ static int rtl_init_one(struct pci_dev *pdev, const 
>> struct pci_device_id *ent)
>>if ((sizeof(dma_addr_t) > 4) &&
>>(use_dac == 1 || (use_dac == -1 && pci_is_pcie(pdev) &&
>>  tp->mac_version >= RTL_GIGA_MAC_VER_18)) &&
>> -!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
>> +!pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) &&
>> +!pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
> 
> Isn't there a dma_set_mask_and_coherent() function ?
> 
>David
> 


Re: [PATCH] mac80211: aes_ccm: move struct aead_req off the stack

2016-10-14 Thread Johannes Berg

> 
> Is the aad[] actually reused? I would assume it only affects the mac
> on encryption, and the verification on decryption but I don't think
> we actually need it back from the crypto routines.

I don't think it's reused.

> Exactly what you said above :-) My patch only touches CCM but as you
> said,
> 
> """
> 'Also there's B_0/J_0 for CCM/GCM, and the 'zero' thing that GMAC
> has.
> """

Ah, but we can/should do the same for the others, no?

johannes


RE: [PATCH v2] r8169: set coherent DMA mask as well as streaming DMA mask

2016-10-14 Thread David Laight
From: Of Ard Biesheuvel
> Sent: 14 October 2016 14:41
> PCI devices that are 64-bit DMA capable should set the coherent
> DMA mask as well as the streaming DMA mask. On some architectures,
> these are managed separately, and so the coherent DMA mask will be
> left at its default value of 32 if it is not set explicitly. This
> results in errors such as
> 
>  r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>  hwdev DMA mask = 0x00000000ffffffff, dev_addr = 0x00000080fbfff000
>  swiotlb: coherent allocation failed for device 0000:02:00.0 size=4096
>  CPU: 0 PID: 1062 Comm: systemd-udevd Not tainted 4.8.0+ #35
>  Hardware name: AMD Seattle/Seattle, BIOS 10:53:24 Oct 13 2016
> 
> on systems without memory that is 32-bit addressable by PCI devices.
> 
> Signed-off-by: Ard Biesheuvel 
> ---
> v2: dropped the hunk that sets the coherent DMA mask to DMA_BIT_MASK(32),
> which is unnecessary given that it is the default
> 
>  drivers/net/ethernet/realtek/r8169.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/realtek/r8169.c 
> b/drivers/net/ethernet/realtek/r8169.c
> index e55638c7505a..bf000d819a21 100644
> --- a/drivers/net/ethernet/realtek/r8169.c
> +++ b/drivers/net/ethernet/realtek/r8169.c
> @@ -8273,7 +8273,8 @@ static int rtl_init_one(struct pci_dev *pdev, const 
> struct pci_device_id *ent)
>   if ((sizeof(dma_addr_t) > 4) &&
>   (use_dac == 1 || (use_dac == -1 && pci_is_pcie(pdev) &&
> tp->mac_version >= RTL_GIGA_MAC_VER_18)) &&
> - !pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
> + !pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) &&
> + !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {

Isn't there a dma_set_mask_and_coherent() function ?

David



Re: [PATCH] mac80211: aes_ccm: move struct aead_req off the stack

2016-10-14 Thread Johannes Berg
On Fri, 2016-10-14 at 14:13 +0100, Ard Biesheuvel wrote:
> 
> > But if we allocate things anyway, is it worth expending per-CPU
> > buffers on these?
> 
> Ehmm, maybe not. I could spin a v2 that allocates a bigger buffer,
> and copies aad[] into it as well

Copies in/out, I guess. Also there's B_0/J_0 for CCM/GCM, and the
'zero' thing that GMAC has.

> That does not help the other algos though

What do you mean?

johannes


[PATCH v2] r8169: set coherent DMA mask as well as streaming DMA mask

2016-10-14 Thread Ard Biesheuvel
PCI devices that are 64-bit DMA capable should set the coherent
DMA mask as well as the streaming DMA mask. On some architectures,
these are managed separately, and so the coherent DMA mask will be
left at its default value of 32 if it is not set explicitly. This
results in errors such as

 r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
 hwdev DMA mask = 0x00000000ffffffff, dev_addr = 0x00000080fbfff000
 swiotlb: coherent allocation failed for device 0000:02:00.0 size=4096
 CPU: 0 PID: 1062 Comm: systemd-udevd Not tainted 4.8.0+ #35
 Hardware name: AMD Seattle/Seattle, BIOS 10:53:24 Oct 13 2016

on systems without memory that is 32-bit addressable by PCI devices.

Signed-off-by: Ard Biesheuvel 
---
v2: dropped the hunk that sets the coherent DMA mask to DMA_BIT_MASK(32),
which is unnecessary given that it is the default

 drivers/net/ethernet/realtek/r8169.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c 
b/drivers/net/ethernet/realtek/r8169.c
index e55638c7505a..bf000d819a21 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -8273,7 +8273,8 @@ static int rtl_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
if ((sizeof(dma_addr_t) > 4) &&
(use_dac == 1 || (use_dac == -1 && pci_is_pcie(pdev) &&
  tp->mac_version >= RTL_GIGA_MAC_VER_18)) &&
-   !pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+   !pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) &&
+   !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
 
/* CPlusCmd Dual Access Cycle is only needed for non-PCIe */
if (!pci_is_pcie(pdev))
-- 
2.7.4



Re: [PATCH] r8169: set coherent DMA mask as well as streaming DMA mask

2016-10-14 Thread Ard Biesheuvel
On 14 October 2016 at 14:34, David Miller  wrote:
> From: Ard Biesheuvel 
> Date: Fri, 14 Oct 2016 14:32:24 +0100
>
>> On 14 October 2016 at 14:31, David Miller  wrote:
>>> From: Ard Biesheuvel 
>>> Date: Fri, 14 Oct 2016 12:39:30 +0100
>>>
>>>> PCI devices that are 64-bit DMA capable should set the coherent
>>>> DMA mask as well as the streaming DMA mask. On some architectures,
>>>> these are managed separately, and so the coherent DMA mask will be
>>>> left at its default value of 32 if it is not set explicitly. This
>>>> results in errors such as
>>>>
>>>>  r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>>>>  hwdev DMA mask = 0x00000000ffffffff, dev_addr = 0x00000080fbfff000
>>>>  swiotlb: coherent allocation failed for device 0000:02:00.0 size=4096
>>>>  CPU: 0 PID: 1062 Comm: systemd-udevd Not tainted 4.8.0+ #35
>>>>  Hardware name: AMD Seattle/Seattle, BIOS 10:53:24 Oct 13 2016
>>>>
>>>> on systems without memory that is 32-bit addressable by PCI devices.
>>>>
>>>> Signed-off-by: Ard Biesheuvel 
>>>  ...
>>>> @@ -8281,6 +8282,8 @@ static int rtl_init_one(struct pci_dev *pdev, const 
>>>> struct pci_device_id *ent)
>>>>   dev->features |= NETIF_F_HIGHDMA;
>>>>   } else {
>>>>   rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
>>>> + if (!rc)
>>>> + rc = pci_set_consistent_dma_mask(pdev, 
>>>> DMA_BIT_MASK(32));
>>>
>>> As you state 32-bit is the default, therefore this part of your patch is 
>>> unnecessary.
>>
>> Perhaps, but the original code did not assume that either. Should I
>> remove the other call in a subsequent patch as well?
>
> I simply want you to respin this with the above hunk removed.
>
> Your code changes and your commit message must be consistent.

OK, fair enough


Re: [PATCH] r8169: set coherent DMA mask as well as streaming DMA mask

2016-10-14 Thread Ard Biesheuvel
On 14 October 2016 at 14:31, David Miller  wrote:
> From: Ard Biesheuvel 
> Date: Fri, 14 Oct 2016 12:39:30 +0100
>
>> PCI devices that are 64-bit DMA capable should set the coherent
>> DMA mask as well as the streaming DMA mask. On some architectures,
>> these are managed separately, and so the coherent DMA mask will be
>> left at its default value of 32 if it is not set explicitly. This
>> results in errors such as
>>
>>  r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>>  hwdev DMA mask = 0x00000000ffffffff, dev_addr = 0x00000080fbfff000
>>  swiotlb: coherent allocation failed for device 0000:02:00.0 size=4096
>>  CPU: 0 PID: 1062 Comm: systemd-udevd Not tainted 4.8.0+ #35
>>  Hardware name: AMD Seattle/Seattle, BIOS 10:53:24 Oct 13 2016
>>
>> on systems without memory that is 32-bit addressable by PCI devices.
>>
>> Signed-off-by: Ard Biesheuvel 
>  ...
>> @@ -8281,6 +8282,8 @@ static int rtl_init_one(struct pci_dev *pdev, const 
>> struct pci_device_id *ent)
>>   dev->features |= NETIF_F_HIGHDMA;
>>   } else {
>>   rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
>> + if (!rc)
>> + rc = pci_set_consistent_dma_mask(pdev, 
>> DMA_BIT_MASK(32));
>
> As you state 32-bit is the default, therefore this part of your patch is 
> unnecessary.

Perhaps, but the original code did not assume that either. Should I
remove the other call in a subsequent patch as well?


Re: [PATCH] r8169: set coherent DMA mask as well as streaming DMA mask

2016-10-14 Thread David Miller
From: Ard Biesheuvel 
Date: Fri, 14 Oct 2016 14:32:24 +0100

> On 14 October 2016 at 14:31, David Miller  wrote:
>> From: Ard Biesheuvel 
>> Date: Fri, 14 Oct 2016 12:39:30 +0100
>>
>>> PCI devices that are 64-bit DMA capable should set the coherent
>>> DMA mask as well as the streaming DMA mask. On some architectures,
>>> these are managed separately, and so the coherent DMA mask will be
>>> left at its default value of 32 if it is not set explicitly. This
>>> results in errors such as
>>>
>>>  r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>>>  hwdev DMA mask = 0x00000000ffffffff, dev_addr = 0x00000080fbfff000
>>>  swiotlb: coherent allocation failed for device 0000:02:00.0 size=4096
>>>  CPU: 0 PID: 1062 Comm: systemd-udevd Not tainted 4.8.0+ #35
>>>  Hardware name: AMD Seattle/Seattle, BIOS 10:53:24 Oct 13 2016
>>>
>>> on systems without memory that is 32-bit addressable by PCI devices.
>>>
>>> Signed-off-by: Ard Biesheuvel 
>>  ...
>>> @@ -8281,6 +8282,8 @@ static int rtl_init_one(struct pci_dev *pdev, const 
>>> struct pci_device_id *ent)
>>>   dev->features |= NETIF_F_HIGHDMA;
>>>   } else {
>>>   rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
>>> + if (!rc)
>>> + rc = pci_set_consistent_dma_mask(pdev, 
>>> DMA_BIT_MASK(32));
>>
>> As you state 32-bit is the default, therefore this part of your patch is 
>> unnecessary.
> 
> Perhaps, but the original code did not assume that either. Should I
> remove the other call in a subsequent patch as well?

I simply want you to respin this with the above hunk removed.

Your code changes and your commit message must be consistent.


Re: Need help with mdiobus_register and phy

2016-10-14 Thread Timur Tabi

Andrew Lunn wrote:

Does the datasheet say anything about this?

I would say for this device, suspend() is too aggressive.


I'll have to find the datasheet.  Let me do some research and get back 
to you.  Thanks for your help so far.


--
Sent by an employee of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the
Code Aurora Forum, hosted by The Linux Foundation.


Re: [PATCH] r8169: set coherent DMA mask as well as streaming DMA mask

2016-10-14 Thread David Miller
From: Ard Biesheuvel 
Date: Fri, 14 Oct 2016 12:39:30 +0100

> PCI devices that are 64-bit DMA capable should set the coherent
> DMA mask as well as the streaming DMA mask. On some architectures,
> these are managed separately, and so the coherent DMA mask will be
> left at its default value of 32 if it is not set explicitly. This
> results in errors such as
> 
>  r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>  hwdev DMA mask = 0x00000000ffffffff, dev_addr = 0x00000080fbfff000
>  swiotlb: coherent allocation failed for device 0000:02:00.0 size=4096
>  CPU: 0 PID: 1062 Comm: systemd-udevd Not tainted 4.8.0+ #35
>  Hardware name: AMD Seattle/Seattle, BIOS 10:53:24 Oct 13 2016
> 
> on systems without memory that is 32-bit addressable by PCI devices.
> 
> Signed-off-by: Ard Biesheuvel 
 ...
> @@ -8281,6 +8282,8 @@ static int rtl_init_one(struct pci_dev *pdev, const 
> struct pci_device_id *ent)
>   dev->features |= NETIF_F_HIGHDMA;
>   } else {
>   rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
> + if (!rc)
> + rc = pci_set_consistent_dma_mask(pdev, 
> DMA_BIT_MASK(32));

As you state 32-bit is the default, therefore this part of your patch is 
unnecessary.


Re: [PATCH] mac80211: aes_ccm: move struct aead_req off the stack

2016-10-14 Thread Ard Biesheuvel
On 14 October 2016 at 14:15, Johannes Berg  wrote:
> On Fri, 2016-10-14 at 14:13 +0100, Ard Biesheuvel wrote:
>>
>> > But if we allocate things anyway, is it worth expending per-CPU
>> > buffers on these?
>>
>> Ehmm, maybe not. I could spin a v2 that allocates a bigger buffer,
>> and copies aad[] into it as well
>
> Copies in/out, I guess. Also there's B_0/J_0 for CCM/GCM, and the
> 'zero' thing that GMAC has.
>

Is the aad[] actually reused? I would assume it only affects the mac
on encryption, and the verification on decryption but I don't think we
actually need it back from the crypto routines.

>> That does not help the other algos though
>
> What do you mean?
>

Exactly what you said above :-) My patch only touches CCM but as you said,

"""
'Also there's B_0/J_0 for CCM/GCM, and the 'zero' thing that GMAC has.
"""


Re: Need help with mdiobus_register and phy

2016-10-14 Thread Andrew Lunn
On Fri, Oct 14, 2016 at 08:03:18AM -0500, Timur Tabi wrote:
> Andrew Lunn wrote:
> >Have you tried using the ethernet-phy-id device tree property? It
> >looks like that will allow you to skip get_phy_device and just create
> >the phy device. You can then bring the phy out of sleep in the probe
> >function?
> 
> The problem I'm experiencing is with ACPI, so I can't use any of the
> fancy of_ apis like of_get_phy_id().  But I'll look into it.
> 
> Is it possible that at803x_suspend() is too aggressive?  That it's
> effectively disabling the phy?  While the phy is suspended, should
> it still respond to MII_PHYSID1 and MII_PHYSID2 requests?

That is a basic assumption of the code. If you cannot read the IDs how
are you supposed to know what device it is, and what quirks you need
to work around its broken features...

Does the datasheet say anything about this?

I would say for this device, suspend() is too aggressive.

  Andrew


Re: [PATCH] mac80211: aes_ccm: move struct aead_req off the stack

2016-10-14 Thread Ard Biesheuvel
On 14 October 2016 at 14:10, Johannes Berg  wrote:
>
>> So use kzalloc
>
> Do we really need kzalloc()? We have things on the stack right now, and
> don't initialize, so surely we don't really need to zero things?
>
>> This only addresses one half of the problem. The other problem, i.e.,
>> the fact that the aad[] array lives on the stack of the caller, is
>> handled adequately imo by the change proposed by Johannes.
>
> But if we allocate things anyway, is it worth expending per-CPU buffers
> on these?
>

Ehmm, maybe not. I could spin a v2 that allocates a bigger buffer, and
copies aad[] into it as well
That does not help the other algos though


Re: [PATCH] mac80211: aes_ccm: move struct aead_req off the stack

2016-10-14 Thread Johannes Berg
On Fri, 2016-10-14 at 15:10 +0200, Johannes Berg wrote:
> > 
> > So use kzalloc
> 
> Do we really need kzalloc()? We have things on the stack right now,
> and don't initialize, so surely we don't really need to zero things? 

Err, never mind, I'm an idiot - we *do* initialize to 0, of course.

johannes


Re: [PATCH] mac80211: aes_ccm: move struct aead_req off the stack

2016-10-14 Thread Johannes Berg

> So use kzalloc

Do we really need kzalloc()? We have things on the stack right now, and
don't initialize, so surely we don't really need to zero things?

> This only addresses one half of the problem. The other problem, i.e.,
> the fact that the aad[] array lives on the stack of the caller, is
> handled adequately imo by the change proposed by Johannes.

But if we allocate things anyway, is it worth expending per-CPU buffers
on these?

johannes


[PATCH] mac80211: aes_ccm: move struct aead_req off the stack

2016-10-14 Thread Ard Biesheuvel
Some CCM implementations (such as the generic CCM wrapper in crypto/)
use scatterlists to map fields of struct aead_req. This means these
data structures cannot live in the vmalloc area, which means that in
the near future, they can no longer live on the stack either.

Given that these data structures have implementation specific context fields,
it really depends on the particular driver whether this issue is likely to
occur or not, and so it seems best to simply move the entire data structure
into the direct mapped kernel heap.

So use kzalloc/kfree to allocate and free the data structures. This pattern
already exists in the IPsec ESP driver, but in the future, we may need to
improve upon this by either moving the request into the SKB, or using a
slab cache to allocate/free the data structures.

Signed-off-by: Ard Biesheuvel 
---

This only addresses one half of the problem. The other problem, i.e., the
fact that the aad[] array lives on the stack of the caller, is handled
adequately imo by the change proposed by Johannes.

 net/mac80211/aes_ccm.c | 24 ++--
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c
index 7663c28ba353..a0ae8cebbe4e 100644
--- a/net/mac80211/aes_ccm.c
+++ b/net/mac80211/aes_ccm.c
@@ -23,13 +23,10 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 
*b_0, u8 *aad,
   size_t mic_len)
 {
struct scatterlist sg[3];
+   struct aead_request *aead_req;
 
-   char aead_req_data[sizeof(struct aead_request) +
-  crypto_aead_reqsize(tfm)]
-   __aligned(__alignof__(struct aead_request));
-   struct aead_request *aead_req = (void *) aead_req_data;
-
-   memset(aead_req, 0, sizeof(aead_req_data));
+   aead_req = kzalloc(sizeof(struct aead_request) +
+  crypto_aead_reqsize(tfm), GFP_ATOMIC);
 
sg_init_table(sg, 3);
sg_set_buf([0], [2], be16_to_cpup((__be16 *)aad));
@@ -41,6 +38,7 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 
*b_0, u8 *aad,
aead_request_set_ad(aead_req, sg[0].length);
 
crypto_aead_encrypt(aead_req);
+   kfree(aead_req);
 }
 
 int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
@@ -48,15 +46,14 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 
*b_0, u8 *aad,
  size_t mic_len)
 {
struct scatterlist sg[3];
-   char aead_req_data[sizeof(struct aead_request) +
-  crypto_aead_reqsize(tfm)]
-   __aligned(__alignof__(struct aead_request));
-   struct aead_request *aead_req = (void *) aead_req_data;
+   struct aead_request *aead_req;
+   int err;
 
if (data_len == 0)
return -EINVAL;
 
-   memset(aead_req, 0, sizeof(aead_req_data));
+   aead_req = kzalloc(sizeof(struct aead_request) +
+  crypto_aead_reqsize(tfm), GFP_ATOMIC);
 
sg_init_table(sg, 3);
sg_set_buf([0], [2], be16_to_cpup((__be16 *)aad));
@@ -67,7 +64,10 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 
*b_0, u8 *aad,
aead_request_set_crypt(aead_req, sg, sg, data_len + mic_len, b_0);
aead_request_set_ad(aead_req, sg[0].length);
 
-   return crypto_aead_decrypt(aead_req);
+   err = crypto_aead_decrypt(aead_req);
+   kfree(aead_req);
+
+   return err;
 }
 
 struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[],
-- 
2.7.4



Re: Need help with mdiobus_register and phy

2016-10-14 Thread Timur Tabi

Andrew Lunn wrote:

Have you tried using the ethernet-phy-id device tree property? It
looks like that will allow you to skip get_phy_device and just create
the phy device. You can then bring the phy out of sleep in the probe
function?


The problem I'm experiencing is with ACPI, so I can't use any of the 
fancy of_ apis like of_get_phy_id().  But I'll look into it.


Is it possible that at803x_suspend() is too aggressive?  That it's 
effectively disabling the phy?  While the phy is suspended, should it 
still respond to MII_PHYSID1 and MII_PHYSID2 requests?


--
Sent by an employee of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the
Code Aurora Forum, hosted by The Linux Foundation.


Re: Need help with mdiobus_register and phy

2016-10-14 Thread Andrew Lunn
On Fri, Oct 14, 2016 at 07:49:56AM -0500, Timur Tabi wrote:
> Andrew Lunn wrote:
> >So are you seeing that the reads to MII_PHYSID1 and MII_PHYSID2 return
> >0xffff, when called from get_phy_id()?

Have you tried using the ethernet-phy-id device tree property? It
looks like that will allow you to skip get_phy_device and just create
the phy device. You can then bring the phy out of sleep in the probe
function?

Andrew


Re: Need help with mdiobus_register and phy

2016-10-14 Thread Timur Tabi

Andrew Lunn wrote:

So are you seeing that the reads to MII_PHYSID1 and MII_PHYSID2 return
0xffff, when called from get_phy_id()?


Yes.

--
Sent by an employee of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the
Code Aurora Forum, hosted by The Linux Foundation.


Re: Need help with mdiobus_register and phy

2016-10-14 Thread Andrew Lunn
> It's the at803x driver.

The at803x_resume() just does normal MDIO transactions. Which suggests
the MDIO bus side of the device is still awake. Or at least, the
MII_BMCR register is.

So are you seeing that the reads to MII_PHYSID1 and MII_PHYSID2 return
0xffff, when called from get_phy_id()?

Andrew


Re: Need help with mdiobus_register and phy

2016-10-14 Thread Timur Tabi

Andrew Lunn wrote:

Please can you tell us what PHY which is, and how it is put to sleep
and woken up.


It's the at803x driver.

http://lxr.free-electrons.com/source/drivers/net/phy/at803x.c

It goes to sleep in its at803x_suspend() function, which is called by 
phy_suspend().


There is a corresponding at803x_resume().  The problem is that this is 
not called by mdiobus_register().  I'm guessing that mdiobus_register() 
assumes that the phy is awake.


It seems like a catch-22.  mdiobus_register() assumes that the phy is 
awake, but you can't wake up the phy until after you call 
mdiobus_register().



If the PHY cannot be woken up using MDIO, then maybe you need to look
at the mdio bus reset call?


I looked at that, but it won't work because there is no phydev when the 
reset function is called:


http://lxr.free-electrons.com/source/drivers/net/phy/mdio_bus.c#L328

It's the same catch-22.

--
Sent by an employee of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the
Code Aurora Forum, hosted by The Linux Foundation.


Re: [PATCH v2] net: Require exact match for TCP socket lookups if dif is l3mdev

2016-10-14 Thread David Ahern
On 10/14/16 12:33 AM, Eric Dumazet wrote:
> There is a catch here.
> TCP moves IP6CB() in a different location.
> 
> Reference :
> 
> 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses")

thanks for the reference.


> Problem is that the lookup can happen from IP early demux, before TCP
> moved IP{6}CB around.

For TCP we only need the exact_dif match for listen sockets, so early demux 
does not apply.

> 
> So you might need to let the caller pass IP6CB(skb)->flags (or
> TCP_SKB_CB(skb)->header.h6.flags ) instead of skb since
> inet6_exact_dif_match() does not know where to fetch the flags.
> 
> Same issue for IPv4.

I'll update the match functions to pull from TCP_SKB_CB instead of IP6CB and 
make a note of the above.

Thanks for the review


Re: [PATCH net-next 1/2] net: phy: Add Speed downshift set driver for Microsemi PHYs.

2016-10-14 Thread Andrew Lunn
On Fri, Oct 14, 2016 at 05:10:32PM +0530, Raju Lakkaraju wrote:
> From: Raju Lakkaraju 
> 
> For operation in cabling environments that are incompatible with
> 1000BAST-T, VSC8531 device provides an automatic link speed
> downshift operation. When enabled, the device automatically changes
> its 1000BAST-T auto-negotiation to the next slower speed after
> a configured number of failed attempts at 1000BAST-T.
> This feature is useful in setting up in networks using older cable
> installations that include only pairs A and B, and not pairs C and D.

Any reason not to just turn this on by default when auto-neg is
enabled?

Andrew


Re: Need help with mdiobus_register and phy

2016-10-14 Thread Andrew Lunn
On Fri, Oct 14, 2016 at 06:38:47AM -0500, Timur Tabi wrote:
> Andrew Lunn wrote:
> >Normally, a sleeping PHY does respond to MDIO. Otherwise, how do you
> >wake it?
> >
> >So i assume this phy has some other means to wake it. What is this
> >means?
> 
> I'm guessing that someone has to call phy_resume() before/during the
> call to mdiobus_register, but I don't see how that's possible.

Please can you tell us which PHY this is, and how it is put to sleep
and woken up.

If the PHY cannot be woken up using MDIO, then maybe you need to look
at the mdio bus reset call?

   Andrew


Re: [PATCH net-next 2/2] net: phy: Add Fast Link Failure - 2 set driver for Microsemi PHYs.

2016-10-14 Thread Andrew Lunn
> On Fri, Oct 14, 2016 at 05:10:33PM +0530, Raju Lakkaraju wrote:
> From: Raju Lakkaraju 
> 
> VSC8531 Fast Link Failure 2 feature enables the PHY to indicate the
> onset of a potential link failure in < 100 usec for 100BASE-TX
> operation. FLF2 is supported through the MDINT (active low) pin.

Is the MDINT pin specific to this feature, or a general interrupt pin?

Device tree is used to describe the hardware. It should not really
describe software or configuration. But the borders are a bit
fluffy. Signal edge rates are close to hardware. This is a lot more
towards configuration. So i'm not sure a device tree property is the
correct way to describe this.

This is also a feature i know other PHYs support. The Marvell PHY has
a "Metro Ethernet" extension which allows it to report link failures
for 1000BASE-T in 10, 20 or 40ms, instead of the usual 750ms. So we
need a generic solution other PHYs can implement.

As with cable testing, i think it should be an ethtool option.

   Andrew


[PATCH net-next 0/2] net: phy: Add Downshift, FLF2 drivers for Microsemi

2016-10-14 Thread Raju Lakkaraju

From: Raju Lakkaraju 

This series adds support for the Speed downshift and Fast Link Failure 2
features to the driver for Microsemi PHYs.

Patch 1/2: Link Speed downshift:
For operation in cabling environments that are incompatible with
1000BASE-T, VSC8531 device provides an automatic link speed
downshift operation. When enabled, the device automatically changes
its 1000BASE-T auto-negotiation to the next slower speed after
a set number of failed attempts at 1000BASE-T.
This feature is useful in setting up in networks using older cable
installations that include only pairs A and B, and not pairs C and D.

Patch 2/2: Fast Link Failure 2:
VSC8531 Fast Link Failure 2 feature enables the PHY to indicate the
onset of a potential link failure in < 100 usec for 100BASE-TX
operation. FLF2 is supported through the MDINT (active low) pin.

All these features tested on Beaglebone Black with VSC 8531 PHY.

Raju Lakkaraju (2):
  net: phy: Add Speed downshift set driver for Microsemi PHYs.
  net: phy: Add Fast Link Failure - 2 set driver for Microsemi PHYs.

 .../devicetree/bindings/net/mscc-phy-vsc8531.txt   |  12 +++
 drivers/net/phy/mscc.c | 120 -
 2 files changed, 131 insertions(+), 1 deletion(-)

-- 
2.7.4



[PATCH net-next 1/2] net: phy: Add Speed downshift set driver for Microsemi PHYs.

2016-10-14 Thread Raju Lakkaraju
From: Raju Lakkaraju 

For operation in cabling environments that are incompatible with
1000BASE-T, VSC8531 device provides an automatic link speed
downshift operation. When enabled, the device automatically changes
its 1000BASE-T auto-negotiation to the next slower speed after
a configured number of failed attempts at 1000BASE-T.
This feature is useful in setting up in networks using older cable
installations that include only pairs A and B, and not pairs C and D.

Signed-off-by: Raju Lakkaraju 
Signed-off-by: Allan W. Nielsen 
---
 .../devicetree/bindings/net/mscc-phy-vsc8531.txt   |  6 ++
 drivers/net/phy/mscc.c | 75 +-
 2 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt 
b/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt
index bdefefc6..062d115 100644
--- a/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt
+++ b/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt
@@ -27,6 +27,11 @@ Optional properties:
  'vddmac'.
  Default value is 0%.
  Ref: Table:1 - Edge rate change (below).
+- downshift-cnt: When enabled, the device automatically 
changes its
+ 1000BAST-T auto-negotiation to the next slower speed
+ after a 'downshift-cnt' of failed attempts at
+ 1000BAST-T. Allowed values: 0, 2, 3, 4, 5.
+ 0 is default and will disable downshifting.
 
 Table: 1 - Edge rate change
 |
@@ -60,4 +65,5 @@ Example:
 compatible = "ethernet-phy-id0007.0570";
 vsc8531,vddmac = <3300>;
 vsc8531,edge-slowdown  = <7>;
+vsc8531,downshift-cnt   = <3>;
 };
diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c
index 43a7545..e87d9f0 100644
--- a/drivers/net/phy/mscc.c
+++ b/drivers/net/phy/mscc.c
@@ -46,8 +46,15 @@ enum rgmii_rx_clock_delay {
 
 #define MSCC_EXT_PAGE_ACCESS 31
 #define MSCC_PHY_PAGE_STANDARD   0x /* Standard registers */
+#define MSCC_PHY_PAGE_EXTENDED   0x0001 /* Extended registers */
 #define MSCC_PHY_PAGE_EXTENDED_2 0x0002 /* Extended reg - page 2 */
 
+/* Extended Page 1 Registers */
+#define MSCC_PHY_ACTIPHY_CNTL20
+#define DOWNSHIFT_CNTL_MASK  0x000C
+#define DOWNSHIFT_EN 0x0010
+#define DOWNSHIFT_CNTL_POS   2
+
 /* Extended Page 2 Registers */
 #define MSCC_PHY_RGMII_CNTL  20
 #define RGMII_RX_CLK_DELAY_MASK  0x0070
@@ -75,6 +82,7 @@ enum rgmii_rx_clock_delay {
 
 struct vsc8531_private {
int rate_magic;
+   u8  downshift_magic;
 };
 
 #ifdef CONFIG_OF_MDIO
@@ -99,6 +107,31 @@ static int vsc85xx_phy_page_set(struct phy_device *phydev, 
u8 page)
return rc;
 }
 
+static int vsc85xx_downshift_set(struct phy_device *phydev, u8 magic)
+{
+   int rc;
+   u16 reg_val;
+
+   mutex_lock(>lock);
+   rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED);
+   if (rc != 0)
+   goto out_unlock;
+
+   reg_val = phy_read(phydev, MSCC_PHY_ACTIPHY_CNTL);
+   reg_val &= ~(DOWNSHIFT_CNTL_MASK);
+   reg_val |= magic;
+   rc = phy_write(phydev, MSCC_PHY_ACTIPHY_CNTL, reg_val);
+   if (rc != 0)
+   goto out_unlock;
+
+   rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
+
+out_unlock:
+   mutex_unlock(>lock);
+
+   return rc;
+}
+
 static int vsc85xx_wol_set(struct phy_device *phydev,
   struct ethtool_wolinfo *wol)
 {
@@ -239,11 +272,42 @@ static int vsc85xx_edge_rate_magic_get(struct phy_device 
*phydev)
 
return -EINVAL;
 }
+
+static int vsc85xx_downshift_magic_get(struct phy_device *phydev)
+{
+   int rc;
+   u8 ds;
+   struct device *dev = >mdio.dev;
+   struct device_node *of_node = dev->of_node;
+
+   if (!of_node)
+   return -ENODEV;
+
+   rc = of_property_read_u8(of_node, "vsc8531,downshift-cnt", );
+   if ((rc == -EINVAL) || (ds == 0))
+   return 0;
+   if (ds == 1 || ds > 5) {
+   phydev_err(phydev, "Invalid downshift count\n");
+   return -EINVAL;
+   }
+
+   /* ds is either 2,3,4 or 5 */
+   ds -= 2;
+   ds <<= DOWNSHIFT_CNTL_POS;
+   ds |= DOWNSHIFT_EN;
+
+   return ds;
+}
 #else
 static int vsc85xx_edge_rate_magic_get(struct phy_device *phydev)
 {
return 0;
 }
+
+static int vsc85xx_downshift_magic_get(struct phy_device *phydev)
+{
+   return 0;
+}
 #endif /* CONFIG_OF_MDIO */
 
 static int vsc85xx_edge_rate_cntl_set(struct phy_device *phydev, u8 edge_rate)
@@ -344,6 +408,10 @@ 

[PATCH 4.4 01/21] time: Add cycles to nanoseconds translation

2016-10-14 Thread Greg Kroah-Hartman
4.4-stable review patch.  If anyone has any objections, please let me know.

--

From: Christopher S. Hall 

commit 6bd58f09e1d8cc6c50a824c00bf0d617919986a1 upstream.

The timekeeping code does not currently provide a way to translate
externally provided clocksource cycles to system time. The cycle count
is always provided by the result of the clocksource read() method internal
to the timekeeping code. The added function timekeeping_cycles_to_ns()
calculates a nanosecond value from a cycle count that can be added to
tk_read_base.base value yielding the current system time. This allows
clocksource cycle values external to the timekeeping code to provide a
cycle count that can be transformed to system time.

Cc: Prarit Bhargava 
Cc: Richard Cochran 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Andy Lutomirski 
Cc: kevin.b.stan...@intel.com
Cc: kevin.j.cla...@intel.com
Cc: h...@zytor.com
Cc: jeffrey.t.kirs...@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner 
Signed-off-by: Christopher S. Hall 
Signed-off-by: John Stultz 
Signed-off-by: Greg Kroah-Hartman 

---
 kernel/time/timekeeping.c |   25 +
 1 file changed, 21 insertions(+), 4 deletions(-)

--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -298,17 +298,34 @@ u32 (*arch_gettimeoffset)(void) = defaul
 static inline u32 arch_gettimeoffset(void) { return 0; }
 #endif
 
+static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
+ cycle_t delta)
+{
+   s64 nsec;
+
+   nsec = delta * tkr->mult + tkr->xtime_nsec;
+   nsec >>= tkr->shift;
+
+   /* If arch requires, add in get_arch_timeoffset() */
+   return nsec + arch_gettimeoffset();
+}
+
 static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
 {
cycle_t delta;
-   s64 nsec;
 
delta = timekeeping_get_delta(tkr);
+   return timekeeping_delta_to_ns(tkr, delta);
+}
 
-   nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift;
+static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
+   cycle_t cycles)
+{
+   cycle_t delta;
 
-   /* If arch requires, add in get_arch_timeoffset() */
-   return nsec + arch_gettimeoffset();
+   /* calculate the delta since the last update_wall_time */
+   delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+   return timekeeping_delta_to_ns(tkr, delta);
 }
 
 /**




[PATCH net-next 2/2] net: phy: Add Fast Link Failure - 2 set driver for Microsemi PHYs.

2016-10-14 Thread Raju Lakkaraju
From: Raju Lakkaraju 

VSC8531 Fast Link Failure 2 feature enables the PHY to indicate the
onset of a potential link failure in < 100 usec for 100BASE-TX
operation. FLF2 is supported through the MDINT (active low) pin.

Signed-off-by: Raju Lakkaraju 
Signed-off-by: Allan W. Nielsen 
---
 .../devicetree/bindings/net/mscc-phy-vsc8531.txt   |  6 +++
 drivers/net/phy/mscc.c | 45 ++
 2 files changed, 51 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt 
b/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt
index 062d115..472fc68 100644
--- a/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt
+++ b/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt
@@ -32,6 +32,11 @@ Optional properties:
  after a 'downshift-cnt' of failed attempts at
  1000BAST-T. Allowed values: 0, 2, 3, 4, 5.
  0 is default and will disable downshifting.
+- flf2 : Fast Link Failure 2 (FLF2) feature enables the PHY
+ to indicate the onset of a potential link failure in
+ < 100 usec for 100BASE-TX operation. FLF2 is
+ supported through the MDINT (active low) pin.
+ Default will be disable flf2.
 
 Table: 1 - Edge rate change
 |
@@ -66,4 +71,5 @@ Example:
 vsc8531,vddmac = <3300>;
 vsc8531,edge-slowdown  = <7>;
 vsc8531,downshift-cnt   = <3>;
+   vsc8531,flf2;
 };
diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c
index e87d9f0..57bd628 100644
--- a/drivers/net/phy/mscc.c
+++ b/drivers/net/phy/mscc.c
@@ -57,6 +57,7 @@ enum rgmii_rx_clock_delay {
 
 /* Extended Page 2 Registers */
 #define MSCC_PHY_RGMII_CNTL  20
+#define FLF2_ENABLE  0x8000
 #define RGMII_RX_CLK_DELAY_MASK  0x0070
 #define RGMII_RX_CLK_DELAY_POS   4
 
@@ -83,6 +84,7 @@ enum rgmii_rx_clock_delay {
 struct vsc8531_private {
int rate_magic;
u8  downshift_magic;
+   bool flf2;  /* Fast Link Failure-2 Enable/Disable */
 };
 
 #ifdef CONFIG_OF_MDIO
@@ -107,6 +109,33 @@ static int vsc85xx_phy_page_set(struct phy_device *phydev, 
u8 page)
return rc;
 }
 
+static int vsc85xx_flf2_set(struct phy_device *phydev, bool op)
+{
+   int rc;
+   u16 reg_val;
+
+   mutex_lock(>lock);
+   rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED_2);
+   if (rc != 0)
+   goto out_unlock;
+
+   reg_val = phy_read(phydev, MSCC_PHY_RGMII_CNTL);
+   if (op)
+   reg_val |= FLF2_ENABLE;
+   else
+   reg_val &= ~FLF2_ENABLE;
+   rc = phy_write(phydev, MSCC_PHY_RGMII_CNTL, reg_val);
+   if (rc != 0)
+   goto out_unlock;
+
+   rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
+
+out_unlock:
+   mutex_unlock(>lock);
+
+   return rc;
+}
+
 static int vsc85xx_downshift_set(struct phy_device *phydev, u8 magic)
 {
int rc;
@@ -412,6 +441,10 @@ static int vsc85xx_config_init(struct phy_device *phydev)
if (rc)
return rc;
 
+   rc = vsc85xx_flf2_set(phydev, vsc8531->flf2);
+   if (rc)
+   return rc;
+
rc = genphy_config_init(phydev);
 
return rc;
@@ -449,6 +482,11 @@ static int vsc85xx_probe(struct phy_device *phydev)
int rate_magic;
int downshift_magic;
struct vsc8531_private *vsc8531;
+   struct device *dev = >mdio.dev;
+   struct device_node *of_node = dev->of_node;
+
+   if (!of_node)
+   return -ENODEV;
 
rate_magic = vsc85xx_edge_rate_magic_get(phydev);
if (rate_magic < 0)
@@ -466,6 +504,13 @@ static int vsc85xx_probe(struct phy_device *phydev)
vsc8531->rate_magic = rate_magic;
vsc8531->downshift_magic = downshift_magic;
 
+#ifdef CONFIG_OF_MDIO
+   /* Fast Link Failure 2 */
+   vsc8531->flf2 = of_property_read_bool(of_node, "vsc8531,flf2");
+#else
+   vsc8531->flf2 = 0;
+#endif
+
return 0;
 }
 
-- 
2.7.4



[PATCH net-next v2 4/6] fjes: ethtool -w and -W support for fjes driver

2016-10-14 Thread Taku Izumi
This patch adds support for ethtool -w and -W
to the fjes driver.

You can enable and disable firmware debug mode by
using ethtool -W, and also retrieve firmware
activity information by using ethtool -w.

This is useful for debugging.

Signed-off-by: Taku Izumi 
---
 drivers/net/fjes/fjes_ethtool.c |  63 ++
 drivers/net/fjes/fjes_hw.c  | 137 
 drivers/net/fjes/fjes_hw.h  |  15 +
 drivers/net/fjes/fjes_trace.h   |  69 
 4 files changed, 284 insertions(+)

diff --git a/drivers/net/fjes/fjes_ethtool.c b/drivers/net/fjes/fjes_ethtool.c
index 68ef287..6575f88 100644
--- a/drivers/net/fjes/fjes_ethtool.c
+++ b/drivers/net/fjes/fjes_ethtool.c
@@ -235,6 +235,66 @@ static void fjes_get_regs(struct net_device *netdev,
regs_buff[36] = rd32(XSCT_ICTL);
 }
 
+static int fjes_set_dump(struct net_device *netdev, struct ethtool_dump *dump)
+{
+   struct fjes_adapter *adapter = netdev_priv(netdev);
+   struct fjes_hw *hw = >hw;
+   int ret = 0;
+
+   if (dump->flag) {
+   if (hw->debug_mode)
+   return -EPERM;
+
+   hw->debug_mode = dump->flag;
+
+   /* enable debug mode */
+   mutex_lock(>hw_info.lock);
+   ret = fjes_hw_start_debug(hw);
+   mutex_unlock(>hw_info.lock);
+
+   if (ret)
+   hw->debug_mode = 0;
+   } else {
+   if (!hw->debug_mode)
+   return -EPERM;
+
+   /* disable debug mode */
+   mutex_lock(>hw_info.lock);
+   ret = fjes_hw_stop_debug(hw);
+   mutex_unlock(>hw_info.lock);
+   }
+
+   return ret;
+}
+
+static int fjes_get_dump_flag(struct net_device *netdev,
+ struct ethtool_dump *dump)
+{
+   struct fjes_adapter *adapter = netdev_priv(netdev);
+   struct fjes_hw *hw = >hw;
+
+   dump->len = hw->hw_info.trace_size;
+   dump->version = 1;
+   dump->flag = hw->debug_mode;
+
+   return 0;
+}
+
+static int fjes_get_dump_data(struct net_device *netdev,
+ struct ethtool_dump *dump, void *buf)
+{
+   struct fjes_adapter *adapter = netdev_priv(netdev);
+   struct fjes_hw *hw = >hw;
+   int ret = 0;
+
+   if (hw->hw_info.trace)
+   memcpy(buf, hw->hw_info.trace, hw->hw_info.trace_size);
+   else
+   ret = -EPERM;
+
+   return ret;
+}
+
 static const struct ethtool_ops fjes_ethtool_ops = {
.get_settings   = fjes_get_settings,
.get_drvinfo= fjes_get_drvinfo,
@@ -243,6 +303,9 @@ static const struct ethtool_ops fjes_ethtool_ops = {
.get_sset_count   = fjes_get_sset_count,
.get_regs   = fjes_get_regs,
.get_regs_len   = fjes_get_regs_len,
+   .set_dump   = fjes_set_dump,
+   .get_dump_flag  = fjes_get_dump_flag,
+   .get_dump_data  = fjes_get_dump_data,
 };
 
 void fjes_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/fjes/fjes_hw.c b/drivers/net/fjes/fjes_hw.c
index dba59dc..9c652c0 100644
--- a/drivers/net/fjes/fjes_hw.c
+++ b/drivers/net/fjes/fjes_hw.c
@@ -343,6 +343,9 @@ int fjes_hw_init(struct fjes_hw *hw)
 
ret = fjes_hw_setup(hw);
 
+   hw->hw_info.trace = vzalloc(FJES_DEBUG_BUFFER_SIZE);
+   hw->hw_info.trace_size = FJES_DEBUG_BUFFER_SIZE;
+
return ret;
 }
 
@@ -351,6 +354,18 @@ void fjes_hw_exit(struct fjes_hw *hw)
int ret;
 
if (hw->base) {
+
+   if (hw->debug_mode) {
+   /* disable debug mode */
+   mutex_lock(>hw_info.lock);
+   fjes_hw_stop_debug(hw);
+   mutex_unlock(>hw_info.lock);
+   }
+   vfree(hw->hw_info.trace);
+   hw->hw_info.trace = NULL;
+   hw->hw_info.trace_size = 0;
+   hw->debug_mode = 0;
+
ret = fjes_hw_reset(hw);
if (ret)
pr_err("%s: reset error", __func__);
@@ -1175,3 +1190,125 @@ static void fjes_hw_epstop_task(struct work_struct 
*work)
}
}
 }
+
+int fjes_hw_start_debug(struct fjes_hw *hw)
+{
+   union fjes_device_command_req *req_buf = hw->hw_info.req_buf;
+   union fjes_device_command_res *res_buf = hw->hw_info.res_buf;
+   enum fjes_dev_command_response_e ret;
+   int page_count;
+   int result = 0;
+   void *addr;
+   int i;
+
+   if (!hw->hw_info.trace)
+   return -EPERM;
+   memset(hw->hw_info.trace, 0, FJES_DEBUG_BUFFER_SIZE);
+
+   memset(req_buf, 0, hw->hw_info.req_buf_size);
+   memset(res_buf, 0, hw->hw_info.res_buf_size);
+
+   req_buf->start_trace.length =
+

[PATCH net-next v2 2/6] fjes: Enhance ethtool -S for fjes driver

2016-10-14 Thread Taku Izumi
This patch enhances ethtool -S for fjes driver so that
EP related statistics can be retrieved.

The following statistics can be displayed via ethtool -S:

 ep%d_com_regist_buf_exec
 ep%d_com_unregist_buf_exec
 ep%d_send_intr_rx
 ep%d_send_intr_unshare
 ep%d_send_intr_zoneupdate
 ep%d_recv_intr_rx
 ep%d_recv_intr_unshare
 ep%d_recv_intr_stop
 ep%d_recv_intr_zoneupdate
 ep%d_tx_buffer_full
 ep%d_tx_dropped_not_shared
 ep%d_tx_dropped_ver_mismatch
 ep%d_tx_dropped_buf_size_mismatch
 ep%d_tx_dropped_vlanid_mismatch

Signed-off-by: Taku Izumi 
---
 drivers/net/fjes/fjes_ethtool.c | 70 -
 drivers/net/fjes/fjes_hw.c  |  9 ++
 drivers/net/fjes/fjes_hw.h  | 19 +++
 drivers/net/fjes/fjes_main.c| 44 +++---
 4 files changed, 137 insertions(+), 5 deletions(-)

diff --git a/drivers/net/fjes/fjes_ethtool.c b/drivers/net/fjes/fjes_ethtool.c
index 8397634..68ef287 100644
--- a/drivers/net/fjes/fjes_ethtool.c
+++ b/drivers/net/fjes/fjes_ethtool.c
@@ -49,10 +49,18 @@ static const struct fjes_stats fjes_gstrings_stats[] = {
FJES_STAT("tx_dropped", stats64.tx_dropped),
 };
 
+#define FJES_EP_STATS_LEN 14
+#define FJES_STATS_LEN \
+   (ARRAY_SIZE(fjes_gstrings_stats) + \
+((&((struct fjes_adapter *)netdev_priv(netdev))->hw)->max_epid - 1) * \
+FJES_EP_STATS_LEN)
+
 static void fjes_get_ethtool_stats(struct net_device *netdev,
   struct ethtool_stats *stats, u64 *data)
 {
struct fjes_adapter *adapter = netdev_priv(netdev);
+   struct fjes_hw *hw = >hw;
+   int epidx;
char *p;
int i;
 
@@ -61,11 +69,39 @@ static void fjes_get_ethtool_stats(struct net_device 
*netdev,
data[i] = (fjes_gstrings_stats[i].sizeof_stat == sizeof(u64))
? *(u64 *)p : *(u32 *)p;
}
+   for (epidx = 0; epidx < hw->max_epid; epidx++) {
+   if (epidx == hw->my_epid)
+   continue;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats
+   .com_regist_buf_exec;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats
+   .com_unregist_buf_exec;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats.send_intr_rx;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats.send_intr_unshare;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats
+   .send_intr_zoneupdate;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats.recv_intr_rx;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats.recv_intr_unshare;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats.recv_intr_stop;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats
+   .recv_intr_zoneupdate;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats.tx_buffer_full;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats
+   .tx_dropped_not_shared;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats
+   .tx_dropped_ver_mismatch;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats
+   .tx_dropped_buf_size_mismatch;
+   data[i++] = hw->ep_shm_info[epidx].ep_stats
+   .tx_dropped_vlanid_mismatch;
+   }
 }
 
 static void fjes_get_strings(struct net_device *netdev,
 u32 stringset, u8 *data)
 {
+   struct fjes_adapter *adapter = netdev_priv(netdev);
+   struct fjes_hw *hw = >hw;
u8 *p = data;
int i;
 
@@ -76,6 +112,38 @@ static void fjes_get_strings(struct net_device *netdev,
   ETH_GSTRING_LEN);
p += ETH_GSTRING_LEN;
}
+   for (i = 0; i < hw->max_epid; i++) {
+   if (i == hw->my_epid)
+   continue;
+   sprintf(p, "ep%u_com_regist_buf_exec", i);
+   p += ETH_GSTRING_LEN;
+   sprintf(p, "ep%u_com_unregist_buf_exec", i);
+   p += ETH_GSTRING_LEN;
+   sprintf(p, "ep%u_send_intr_rx", i);
+   p += ETH_GSTRING_LEN;
+   sprintf(p, "ep%u_send_intr_unshare", i);
+   p += ETH_GSTRING_LEN;
+   sprintf(p, "ep%u_send_intr_zoneupdate", i);
+   p += ETH_GSTRING_LEN;
+   sprintf(p, "ep%u_recv_intr_rx", i);
+   p += ETH_GSTRING_LEN;
+   sprintf(p, "ep%u_recv_intr_unshare", i);
+   p += ETH_GSTRING_LEN;
+   sprintf(p, "ep%u_recv_intr_stop", i);
+   p += ETH_GSTRING_LEN;
+   

[PATCH net-next v2 6/6] fjes: Update fjes driver version : 1.2

2016-10-14 Thread Taku Izumi
Signed-off-by: Taku Izumi 
---
 drivers/net/fjes/fjes_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c
index 359e7a5..f36eb4a 100644
--- a/drivers/net/fjes/fjes_main.c
+++ b/drivers/net/fjes/fjes_main.c
@@ -30,7 +30,7 @@
 #include "fjes_trace.h"
 
 #define MAJ 1
-#define MIN 1
+#define MIN 2
 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN)
 #define DRV_NAME   "fjes"
 char fjes_driver_name[] = DRV_NAME;
-- 
2.6.6



[PATCH] r8169: set coherent DMA mask as well as streaming DMA mask

2016-10-14 Thread Ard Biesheuvel
PCI devices that are 64-bit DMA capable should set the coherent
DMA mask as well as the streaming DMA mask. On some architectures,
these are managed separately, and so the coherent DMA mask will be
left at its default value of 32 if it is not set explicitly. This
results in errors such as

 r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
 hwdev DMA mask = 0x00000000ffffffff, dev_addr = 0x00000080fbfff000
 swiotlb: coherent allocation failed for device 0000:02:00.0 size=4096
 CPU: 0 PID: 1062 Comm: systemd-udevd Not tainted 4.8.0+ #35
 Hardware name: AMD Seattle/Seattle, BIOS 10:53:24 Oct 13 2016

on systems without memory that is 32-bit addressable by PCI devices.

Signed-off-by: Ard Biesheuvel 
---
 drivers/net/ethernet/realtek/r8169.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c 
b/drivers/net/ethernet/realtek/r8169.c
index e55638c7505a..04957a36b11f 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -8273,7 +8273,8 @@ static int rtl_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
if ((sizeof(dma_addr_t) > 4) &&
(use_dac == 1 || (use_dac == -1 && pci_is_pcie(pdev) &&
  tp->mac_version >= RTL_GIGA_MAC_VER_18)) &&
-   !pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+   !pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) &&
+   !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
 
/* CPlusCmd Dual Access Cycle is only needed for non-PCIe */
if (!pci_is_pcie(pdev))
@@ -8281,6 +8282,8 @@ static int rtl_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
dev->features |= NETIF_F_HIGHDMA;
} else {
rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+   if (!rc)
+   rc = pci_set_consistent_dma_mask(pdev, 
DMA_BIT_MASK(32));
if (rc < 0) {
netif_err(tp, probe, dev, "DMA configuration failed\n");
goto err_out_unmap_4;
-- 
2.7.4



Re: Need help with mdiobus_register and phy

2016-10-14 Thread Timur Tabi

Andrew Lunn wrote:

Normally, a sleeping PHY does respond to MDIO. Otherwise, how do you
wake it?

So I assume this PHY has some other means to wake it. What is this
means?


I'm guessing that someone has to call phy_resume() before/during the 
call to mdiobus_register, but I don't see how that's possible.


--
Sent by an employee of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the
Code Aurora Forum, hosted by The Linux Foundation.


[PATCH net-next v2 5/6] fjes: Add debugfs entry for EP status information in fjes driver

2016-10-14 Thread Taku Izumi
This patch adds a debugfs entry to show EP status information.
You can get each EP's status information like the following:

  # cat /sys/kernel/debug/fjes/fjes.0/status

  EPID    STATUS           SAME_ZONE        CONNECTED
  ep0     shared           Y                Y
  ep1     -                -                -
  ep2     unshared         N                N
  ep3     unshared         N                N
  ep4     unshared         N                N
  ep5     unshared         N                N
  ep6     unshared         N                N
  ep7     unshared         N                N

Signed-off-by: Taku Izumi 
---
 drivers/net/fjes/Makefile   |   2 +-
 drivers/net/fjes/fjes.h |  16 ++
 drivers/net/fjes/fjes_debugfs.c | 117 
 drivers/net/fjes/fjes_main.c|  12 -
 4 files changed, 145 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/fjes/fjes_debugfs.c

diff --git a/drivers/net/fjes/Makefile b/drivers/net/fjes/Makefile
index 6705d1b..bc47b35 100644
--- a/drivers/net/fjes/Makefile
+++ b/drivers/net/fjes/Makefile
@@ -27,4 +27,4 @@
 
 obj-$(CONFIG_FUJITSU_ES) += fjes.o
 
-fjes-objs := fjes_main.o fjes_hw.o fjes_ethtool.o fjes_trace.o
+fjes-objs := fjes_main.o fjes_hw.o fjes_ethtool.o fjes_trace.o fjes_debugfs.o
diff --git a/drivers/net/fjes/fjes.h b/drivers/net/fjes/fjes.h
index a592fe2..0372be3 100644
--- a/drivers/net/fjes/fjes.h
+++ b/drivers/net/fjes/fjes.h
@@ -66,6 +66,10 @@ struct fjes_adapter {
bool interrupt_watch_enable;
 
struct fjes_hw hw;
+
+#ifdef CONFIG_DEBUG_FS
+   struct dentry *dbg_adapter;
+#endif
 };
 
 extern char fjes_driver_name[];
@@ -74,4 +78,16 @@ extern const u32 fjes_support_mtu[];
 
 void fjes_set_ethtool_ops(struct net_device *);
 
+#ifdef CONFIG_DEBUG_FS
+void fjes_dbg_adapter_init(struct fjes_adapter *adapter);
+void fjes_dbg_adapter_exit(struct fjes_adapter *adapter);
+void fjes_dbg_init(void);
+void fjes_dbg_exit(void);
+#else
+static inline void fjes_dbg_adapter_init(struct fjes_adapter *adapter) {}
+static inline void fjes_dbg_adapter_exit(struct fjes_adapter *adapter) {}
+static inline void fjes_dbg_init(void) {}
+static inline void fjes_dbg_exit(void) {}
+#endif /* CONFIG_DEBUG_FS */
+
 #endif /* FJES_H_ */
diff --git a/drivers/net/fjes/fjes_debugfs.c b/drivers/net/fjes/fjes_debugfs.c
new file mode 100644
index 000..30052eb
--- /dev/null
+++ b/drivers/net/fjes/fjes_debugfs.c
@@ -0,0 +1,117 @@
+/*
+ *  FUJITSU Extended Socket Network Device driver
+ *  Copyright (c) 2015-2016 FUJITSU LIMITED
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ */
+
+/* debugfs support for fjes driver */
+
+#ifdef CONFIG_DEBUG_FS
+
+#include 
+#include 
+#include 
+
+#include "fjes.h"
+
+static struct dentry *fjes_debug_root;
+
+static const char * const ep_status_string[] = {
+   "unshared",
+   "shared",
+   "waiting",
+   "complete",
+};
+
+static int fjes_dbg_status_show(struct seq_file *m, void *v)
+{
+   struct fjes_adapter *adapter = m->private;
+   struct fjes_hw *hw = >hw;
+   int max_epid = hw->max_epid;
+   int my_epid = hw->my_epid;
+   int epidx;
+
+   seq_puts(m, "EPID\tSTATUS   SAME_ZONECONNECTED\n");
+   for (epidx = 0; epidx < max_epid; epidx++) {
+   if (epidx == my_epid) {
+   seq_printf(m, "ep%d\t%-16c %-16c %-16c\n",
+  epidx, '-', '-', '-');
+   } else {
+   seq_printf(m, "ep%d\t%-16s %-16c %-16c\n",
+  epidx,
+  
ep_status_string[fjes_hw_get_partner_ep_status(hw, epidx)],
+  fjes_hw_epid_is_same_zone(hw, epidx) ? 'Y' : 
'N',
+  fjes_hw_epid_is_shared(hw->hw_info.share, 
epidx) ? 'Y' : 'N');
+   }
+   }
+
+   return 0;
+}
+
+static int fjes_dbg_status_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, fjes_dbg_status_show, inode->i_private);
+}
+
+static const struct file_operations fjes_dbg_status_fops = {
+   .owner  = THIS_MODULE,
+   .open   = fjes_dbg_status_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   

[PATCH net-next v2 3/6] fjes: Add tracepoints in fjes driver

2016-10-14 Thread Taku Izumi
This patch adds tracepoints to the fjes driver.
They are useful for debugging purposes.

Signed-off-by: Taku Izumi 
---
 drivers/net/fjes/Makefile |   2 +-
 drivers/net/fjes/fjes_hw.c|  25 +++-
 drivers/net/fjes/fjes_main.c  |   5 +
 drivers/net/fjes/fjes_trace.c |  30 
 drivers/net/fjes/fjes_trace.h | 311 ++
 5 files changed, 369 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/fjes/fjes_trace.c
 create mode 100644 drivers/net/fjes/fjes_trace.h

diff --git a/drivers/net/fjes/Makefile b/drivers/net/fjes/Makefile
index 523e3d7..6705d1b 100644
--- a/drivers/net/fjes/Makefile
+++ b/drivers/net/fjes/Makefile
@@ -27,4 +27,4 @@
 
 obj-$(CONFIG_FUJITSU_ES) += fjes.o
 
-fjes-objs := fjes_main.o fjes_hw.o fjes_ethtool.o
+fjes-objs := fjes_main.o fjes_hw.o fjes_ethtool.o fjes_trace.o
diff --git a/drivers/net/fjes/fjes_hw.c b/drivers/net/fjes/fjes_hw.c
index 82b56e8..dba59dc 100644
--- a/drivers/net/fjes/fjes_hw.c
+++ b/drivers/net/fjes/fjes_hw.c
@@ -21,6 +21,7 @@
 
 #include "fjes_hw.h"
 #include "fjes.h"
+#include "fjes_trace.h"
 
 static void fjes_hw_update_zone_task(struct work_struct *);
 static void fjes_hw_epstop_task(struct work_struct *);
@@ -371,7 +372,7 @@ fjes_hw_issue_request_command(struct fjes_hw *hw,
enum fjes_dev_command_response_e ret = FJES_CMD_STATUS_UNKNOWN;
union REG_CR cr;
union REG_CS cs;
-   int timeout;
+   int timeout = FJES_COMMAND_REQ_TIMEOUT * 1000;
 
cr.reg = 0;
cr.bits.req_start = 1;
@@ -408,6 +409,8 @@ fjes_hw_issue_request_command(struct fjes_hw *hw,
}
}
 
+   trace_fjes_hw_issue_request_command(, , timeout, ret);
+
return ret;
 }
 
@@ -427,11 +430,13 @@ int fjes_hw_request_info(struct fjes_hw *hw)
res_buf->info.code = 0;
 
ret = fjes_hw_issue_request_command(hw, FJES_CMD_REQ_INFO);
+   trace_fjes_hw_request_info(hw, res_buf);
 
result = 0;
 
if (FJES_DEV_COMMAND_INFO_RES_LEN((*hw->hw_info.max_epid)) !=
res_buf->info.length) {
+   trace_fjes_hw_request_info_err("Invalid res_buf");
result = -ENOMSG;
} else if (ret == FJES_CMD_STATUS_NORMAL) {
switch (res_buf->info.code) {
@@ -448,6 +453,7 @@ int fjes_hw_request_info(struct fjes_hw *hw)
result = -EPERM;
break;
case FJES_CMD_STATUS_TIMEOUT:
+   trace_fjes_hw_request_info_err("Timeout");
result = -EBUSY;
break;
case FJES_CMD_STATUS_ERROR_PARAM:
@@ -512,6 +518,8 @@ int fjes_hw_register_buff_addr(struct fjes_hw *hw, int 
dest_epid,
res_buf->share_buffer.length = 0;
res_buf->share_buffer.code = 0;
 
+   trace_fjes_hw_register_buff_addr_req(req_buf, buf_pair);
+
ret = fjes_hw_issue_request_command(hw, FJES_CMD_REQ_SHARE_BUFFER);
 
timeout = FJES_COMMAND_REQ_BUFF_TIMEOUT * 1000;
@@ -532,16 +540,20 @@ int fjes_hw_register_buff_addr(struct fjes_hw *hw, int 
dest_epid,
 
result = 0;
 
+   trace_fjes_hw_register_buff_addr(res_buf, timeout);
+
if (res_buf->share_buffer.length !=
-   FJES_DEV_COMMAND_SHARE_BUFFER_RES_LEN)
+   FJES_DEV_COMMAND_SHARE_BUFFER_RES_LEN) {
+   trace_fjes_hw_register_buff_addr_err("Invalid res_buf");
result = -ENOMSG;
-   else if (ret == FJES_CMD_STATUS_NORMAL) {
+   } else if (ret == FJES_CMD_STATUS_NORMAL) {
switch (res_buf->share_buffer.code) {
case FJES_CMD_REQ_RES_CODE_NORMAL:
result = 0;
set_bit(dest_epid, >hw_info.buffer_share_bit);
break;
case FJES_CMD_REQ_RES_CODE_BUSY:
+   trace_fjes_hw_register_buff_addr_err("Busy Timeout");
result = -EBUSY;
break;
default:
@@ -554,6 +566,7 @@ int fjes_hw_register_buff_addr(struct fjes_hw *hw, int 
dest_epid,
result = -EPERM;
break;
case FJES_CMD_STATUS_TIMEOUT:
+   trace_fjes_hw_register_buff_addr_err("Timeout");
result = -EBUSY;
break;
case FJES_CMD_STATUS_ERROR_PARAM:
@@ -595,6 +608,7 @@ int fjes_hw_unregister_buff_addr(struct fjes_hw *hw, int 
dest_epid)
res_buf->unshare_buffer.length = 0;
res_buf->unshare_buffer.code = 0;
 
+   trace_fjes_hw_unregister_buff_addr_req(req_buf);
ret = fjes_hw_issue_request_command(hw, FJES_CMD_REQ_UNSHARE_BUFFER);
 
timeout = FJES_COMMAND_REQ_BUFF_TIMEOUT * 1000;
@@ -616,8 +630,11 @@ int fjes_hw_unregister_buff_addr(struct fjes_hw *hw, int 
dest_epid)
 
result = 0;
 
+   

  1   2   >