[PATCH net] nfp: flower: fix port metadata conversion bug

2018-07-27 Thread Jakub Kicinski
From: John Hurley 

Function nfp_flower_repr_get_type_and_port expects an enum nfp_repr_type
return value but, if the repr type is unknown, returns a value of type
enum nfp_flower_cmsg_port_type.  This means that if FW encodes the port
ID in a way the driver does not understand, then instead of dropping the
frame the driver may attribute it to a physical port (uplink), provided
the port number is less than the physical port count.

Fix this and ensure a net_device of NULL is returned if the repr can not
be determined.

Fixes: 1025351a88a4 ("nfp: add flower app")
Signed-off-by: John Hurley 
Signed-off-by: Jakub Kicinski 
---
This is low impact and unlikely, but the fix is also trivial, so either
net or net-next works.

 drivers/net/ethernet/netronome/nfp/flower/main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c 
b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 1decf3a1cad3..e57d23746585 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -80,7 +80,7 @@ nfp_flower_repr_get_type_and_port(struct nfp_app *app, u32 
port_id, u8 *port)
return NFP_REPR_TYPE_VF;
}
 
-   return NFP_FLOWER_CMSG_PORT_TYPE_UNSPEC;
+   return __NFP_REPR_TYPE_MAX;
 }
 
 static struct net_device *
@@ -91,6 +91,8 @@ nfp_flower_repr_get(struct nfp_app *app, u32 port_id)
u8 port = 0;
 
repr_type = nfp_flower_repr_get_type_and_port(app, port_id, );
+   if (repr_type > NFP_REPR_TYPE_MAX)
+   return NULL;
 
reprs = rcu_dereference(app->reprs[repr_type]);
if (!reprs)
-- 
2.17.1



[PATCH bpf] net: xsk: don't return frames via the allocator on error

2018-07-27 Thread Jakub Kicinski
xdp_return_buff() is used when frame has been successfully
handled (transmitted) or if an error occurred during delayed
processing and there is no way to report it back to
xdp_do_redirect().

In the case of __xsk_rcv_zc(), the error is propagated all the way
back to the driver, so there is no need to call
xdp_return_buff().  The driver will recycle the frame anyway
after seeing that an error happened.

Fixes: 173d3adb6f43 ("xsk: add zero-copy support for Rx")
Signed-off-by: Jakub Kicinski 
---
Patch could as well be applied to bpf-next, since there are no drivers
for AF_XDP in tree, yet.  xdp_umem_get_dma() and xdp_umem_get_data() are
not even exported.  But one could reimplement those...

As I mentioned I think this makes the entire MEM_TYPE_ZERO_COPY allocator
handling dead code now :(

 net/xdp/xsk.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 72335c2e8108..4e937cd7c17d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -84,10 +84,8 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff 
*xdp, u32 len)
 {
int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
 
-   if (err) {
-   xdp_return_buff(xdp);
+   if (err)
xs->rx_dropped++;
-   }
 
return err;
 }
-- 
2.17.1



[PATCH net] enic: handle mtu change for vf properly

2018-07-27 Thread Govindarajulu Varadarajan
When the driver gets a notification for an MTU change, it does not handle
it for all RQs. It handles only RQ[0].

The fix is to factor the stop/set-MTU/reopen sequence of enic_change_mtu()
into a common helper, _enic_change_mtu(), and use it to change the MTU for
the VF.

Signed-off-by: Govindarajulu Varadarajan 
---
 drivers/net/ethernet/cisco/enic/enic_main.c | 78 +++--
 1 file changed, 27 insertions(+), 51 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c 
b/drivers/net/ethernet/cisco/enic/enic_main.c
index 90c645b8538e..6b0376123cde 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -2047,28 +2047,42 @@ static int enic_stop(struct net_device *netdev)
return 0;
 }
 
+static int _enic_change_mtu(struct net_device *netdev, int new_mtu)
+{
+   bool running = netif_running(netdev);
+   int err = 0;
+
+   ASSERT_RTNL();
+   if (running) {
+   err = enic_stop(netdev);
+   if (err)
+   return err;
+   }
+
+   netdev->mtu = new_mtu;
+
+   if (running) {
+   err = enic_open(netdev);
+   if (err)
+   return err;
+   }
+
+   return 0;
+}
+
 static int enic_change_mtu(struct net_device *netdev, int new_mtu)
 {
struct enic *enic = netdev_priv(netdev);
-   int running = netif_running(netdev);
 
if (enic_is_dynamic(enic) || enic_is_sriov_vf(enic))
return -EOPNOTSUPP;
 
-   if (running)
-   enic_stop(netdev);
-
-   netdev->mtu = new_mtu;
-
if (netdev->mtu > enic->port_mtu)
netdev_warn(netdev,
-   "interface MTU (%d) set higher than port MTU (%d)\n",
-   netdev->mtu, enic->port_mtu);
+   "interface MTU (%d) set higher than port MTU 
(%d)\n",
+   netdev->mtu, enic->port_mtu);
 
-   if (running)
-   enic_open(netdev);
-
-   return 0;
+   return _enic_change_mtu(netdev, new_mtu);
 }
 
 static void enic_change_mtu_work(struct work_struct *work)
@@ -2076,47 +2090,9 @@ static void enic_change_mtu_work(struct work_struct 
*work)
struct enic *enic = container_of(work, struct enic, change_mtu_work);
struct net_device *netdev = enic->netdev;
int new_mtu = vnic_dev_mtu(enic->vdev);
-   int err;
-   unsigned int i;
-
-   new_mtu = max_t(int, ENIC_MIN_MTU, min_t(int, ENIC_MAX_MTU, new_mtu));
 
rtnl_lock();
-
-   /* Stop RQ */
-   del_timer_sync(>notify_timer);
-
-   for (i = 0; i < enic->rq_count; i++)
-   napi_disable(>napi[i]);
-
-   vnic_intr_mask(>intr[0]);
-   enic_synchronize_irqs(enic);
-   err = vnic_rq_disable(>rq[0]);
-   if (err) {
-   rtnl_unlock();
-   netdev_err(netdev, "Unable to disable RQ.\n");
-   return;
-   }
-   vnic_rq_clean(>rq[0], enic_free_rq_buf);
-   vnic_cq_clean(>cq[0]);
-   vnic_intr_clean(>intr[0]);
-
-   /* Fill RQ with new_mtu-sized buffers */
-   netdev->mtu = new_mtu;
-   vnic_rq_fill(>rq[0], enic_rq_alloc_buf);
-   /* Need at least one buffer on ring to get going */
-   if (vnic_rq_desc_used(>rq[0]) == 0) {
-   rtnl_unlock();
-   netdev_err(netdev, "Unable to alloc receive buffers.\n");
-   return;
-   }
-
-   /* Start RQ */
-   vnic_rq_enable(>rq[0]);
-   napi_enable(>napi[0]);
-   vnic_intr_unmask(>intr[0]);
-   enic_notify_timer_start(enic);
-
+   (void)_enic_change_mtu(netdev, new_mtu);
rtnl_unlock();
 
netdev_info(netdev, "interface MTU set as %d\n", netdev->mtu);
-- 
2.17.1



[net-next V2 10/12] net/mlx5e: Vxlan, add sync lock for add/del vxlan port

2018-07-27 Thread Saeed Mahameed
The vxlan API can and will be called from different mlx5 modules, so we
should not count on the mlx5e private state lock only. Hence we introduce
a vxlan private mutex to sync between add/del vxlan port operations.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c| 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index c9a50753ab23..9a8fd762167b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -39,9 +39,10 @@
 struct mlx5_vxlan {
struct mlx5_core_dev*mdev;
spinlock_t  lock; /* protect vxlan table */
-   int num_ports;
/* max_num_ports is usuallly 4, 16 buckets is more than enough */
DECLARE_HASHTABLE(htable, 4);
+   int num_ports;
+   struct mutexsync_lock; /* sync add/del port HW 
operations */
 };
 
 struct mlx5_vxlan_port {
@@ -115,17 +116,18 @@ int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 
port)
return 0;
}
 
+   mutex_lock(>sync_lock);
if (vxlan->num_ports >= mlx5_vxlan_max_udp_ports(vxlan->mdev)) {
mlx5_core_info(vxlan->mdev,
   "UDP port (%d) not offloaded, max number of UDP 
ports (%d) are already offloaded\n",
   port, mlx5_vxlan_max_udp_ports(vxlan->mdev));
ret = -ENOSPC;
-   return ret;
+   goto unlock;
}
 
ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port);
if (ret)
-   return ret;
+   goto unlock;
 
vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL);
if (!vxlanp) {
@@ -141,10 +143,14 @@ int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 
port)
spin_unlock_bh(>lock);
 
vxlan->num_ports++;
+   mutex_unlock(>sync_lock);
return 0;
 
 err_delete_port:
mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port);
+
+unlock:
+   mutex_unlock(>sync_lock);
return ret;
 }
 
@@ -154,6 +160,8 @@ int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
bool remove = false;
int ret = 0;
 
+   mutex_lock(>sync_lock);
+
spin_lock_bh(>lock);
vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port);
if (!vxlanp) {
@@ -174,6 +182,9 @@ int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
kfree(vxlanp);
vxlan->num_ports--;
}
+
+   mutex_unlock(>sync_lock);
+
return ret;
 }
 
@@ -189,6 +200,7 @@ struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev 
*mdev)
return ERR_PTR(-ENOMEM);
 
vxlan->mdev = mdev;
+   mutex_init(>sync_lock);
spin_lock_init(>lock);
hash_init(vxlan->htable);
 
-- 
2.17.0



[net-next V2 07/12] net/mlx5e: Vxlan, rename struct mlx5e_vxlan to mlx5_vxlan_port

2018-07-27 Thread Saeed Mahameed
The name mlx5e_vxlan will be used in a downstream patch to describe the
mlx5 vxlan structure that will replace mlx5e_vxlan_db.

Hence we rename struct mlx5e_vxlan to mlx5_vxlan_port which describes a
mlx5 vxlan port.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 63 +--
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   |  4 +-
 2 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 9a8ca532a443..a2b48ad77f26 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -36,7 +36,7 @@
 #include "mlx5_core.h"
 #include "vxlan.h"
 
-struct mlx5e_vxlan {
+struct mlx5_vxlan_port {
struct hlist_node hlist;
atomic_t refcount;
u16 udp_port;
@@ -83,40 +83,40 @@ static int mlx5e_vxlan_core_del_port_cmd(struct 
mlx5_core_dev *mdev, u16 port)
return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
 }
 
-static struct mlx5e_vxlan *mlx5e_vxlan_lookup_port_locked(struct mlx5e_priv 
*priv,
- u16 port)
+static struct mlx5_vxlan_port*
+mlx5e_vxlan_lookup_port_locked(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   struct mlx5e_vxlan*vxlan;
+   struct mlx5_vxlan_port *vxlanp;
 
-   hash_for_each_possible(vxlan_db->htable, vxlan, hlist, port) {
-   if (vxlan->udp_port == port)
-   return vxlan;
+   hash_for_each_possible(vxlan_db->htable, vxlanp, hlist, port) {
+   if (vxlanp->udp_port == port)
+   return vxlanp;
}
 
return NULL;
 }
 
-struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port)
+struct mlx5_vxlan_port *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 
port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   struct mlx5e_vxlan *vxlan;
+   struct mlx5_vxlan_port *vxlanp;
 
spin_lock_bh(_db->lock);
-   vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
+   vxlanp = mlx5e_vxlan_lookup_port_locked(priv, port);
spin_unlock_bh(_db->lock);
 
-   return vxlan;
+   return vxlanp;
 }
 
 void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   struct mlx5e_vxlan *vxlan;
+   struct mlx5_vxlan_port *vxlanp;
 
-   vxlan = mlx5e_vxlan_lookup_port(priv, port);
-   if (vxlan) {
-   atomic_inc(>refcount);
+   vxlanp = mlx5e_vxlan_lookup_port(priv, port);
+   if (vxlanp) {
+   atomic_inc(>refcount);
return;
}
 
@@ -130,15 +130,15 @@ void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 
port)
if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
return;
 
-   vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL);
-   if (!vxlan)
+   vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL);
+   if (!vxlanp)
goto err_delete_port;
 
-   vxlan->udp_port = port;
-   atomic_set(>refcount, 1);
+   vxlanp->udp_port = port;
+   atomic_set(>refcount, 1);
 
spin_lock_bh(_db->lock);
-   hash_add(vxlan_db->htable, >hlist, port);
+   hash_add(vxlan_db->htable, >hlist, port);
spin_unlock_bh(_db->lock);
 
vxlan_db->num_ports++;
@@ -151,17 +151,16 @@ void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 
port)
 void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   struct mlx5e_vxlan *vxlan;
+   struct mlx5_vxlan_port *vxlanp;
bool remove = false;
 
spin_lock_bh(_db->lock);
-
-   vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
-   if (!vxlan)
+   vxlanp = mlx5e_vxlan_lookup_port_locked(priv, port);
+   if (!vxlanp)
goto out_unlock;
 
-   if (atomic_dec_and_test(>refcount)) {
-   hash_del(>hlist);
+   if (atomic_dec_and_test(>refcount)) {
+   hash_del(>hlist);
remove = true;
}
 
@@ -170,7 +169,7 @@ void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port)
 
if (remove) {
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
-   kfree(vxlan);
+   kfree(vxlanp);
vxlan_db->num_ports--;
}
 }
@@ -178,14 +177,14 @@ void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 
port)
 void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   struct mlx5e_vxlan *vxlan;
+   struct mlx5_vxlan_port *vxlanp;
struct hlist_node *tmp;
int bkt;
 
/* Lockless since we are the only hash table consumers, wq and TX are 
disabled */
-   hash_for_each_safe(vxlan_db->htable, bkt, tmp, vxlan, 

[net-next V2 04/12] net/mlx5e: Vxlan, cleanup an unused member in vxlan work

2018-07-27 Thread Saeed Mahameed
From: Gal Pressman 

Clean up the sa_family member of the vxlan work; it is not used or
needed anywhere in the code.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c   | 4 +---
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.h   | 4 +---
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index fad947079a43..14a201cbb0a4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3980,7 +3980,7 @@ static void mlx5e_add_vxlan_port(struct net_device 
*netdev,
if (!mlx5e_vxlan_allowed(priv->mdev))
return;
 
-   mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 1);
+   mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1);
 }
 
 static void mlx5e_del_vxlan_port(struct net_device *netdev,
@@ -3994,7 +3994,7 @@ static void mlx5e_del_vxlan_port(struct net_device 
*netdev,
if (!mlx5e_vxlan_allowed(priv->mdev))
return;
 
-   mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 0);
+   mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0);
 }
 
 static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 3c0ea9bc20e3..4b9190d677fc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -191,8 +191,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
kfree(vxlan_work);
 }
 
-void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, sa_family_t sa_family,
-   u16 port, int add)
+void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add)
 {
struct mlx5e_vxlan_work *vxlan_work;
 
@@ -207,7 +206,6 @@ void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, 
sa_family_t sa_family,
 
vxlan_work->priv = priv;
vxlan_work->port = port;
-   vxlan_work->sa_family = sa_family;
queue_work(priv->wq, _work->work);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
index 52c41c22235d..51f19e3e5784 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
@@ -44,7 +44,6 @@ struct mlx5e_vxlan {
 struct mlx5e_vxlan_work {
struct work_struct  work;
struct mlx5e_priv   *priv;
-   sa_family_t sa_family;
u16 port;
 };
 
@@ -57,8 +56,7 @@ static inline bool mlx5e_vxlan_allowed(struct mlx5_core_dev 
*mdev)
 void mlx5e_vxlan_init(struct mlx5e_priv *priv);
 void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv);
 
-void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, sa_family_t sa_family,
-   u16 port, int add);
+void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add);
 struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port);
 
 #endif /* __MLX5_VXLAN_H__ */
-- 
2.17.0



[net-next V2 06/12] net/mlx5e: Vxlan, move netdev only logic to en_main.c

2018-07-27 Thread Saeed Mahameed
Create a direct vxlan API to add and delete vxlan ports from HW.
+void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port);
+void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port);

And move vxlan_add/del_work to en_main.c since they are netdev only
logic.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 51 +
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 55 +++
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   | 16 +-
 3 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 14a201cbb0a4..7a6b78e3b5f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3969,6 +3969,57 @@ static int mlx5e_get_vf_stats(struct net_device *dev,
 }
 #endif
 
+struct mlx5e_vxlan_work {
+   struct work_struct  work;
+   struct mlx5e_priv   *priv;
+   u16 port;
+};
+
+static void mlx5e_vxlan_add_work(struct work_struct *work)
+{
+   struct mlx5e_vxlan_work *vxlan_work =
+   container_of(work, struct mlx5e_vxlan_work, work);
+   struct mlx5e_priv *priv = vxlan_work->priv;
+   u16 port = vxlan_work->port;
+
+   mutex_lock(>state_lock);
+   mlx5e_vxlan_add_port(priv, port);
+   mutex_unlock(>state_lock);
+
+   kfree(vxlan_work);
+}
+
+static void mlx5e_vxlan_del_work(struct work_struct *work)
+{
+   struct mlx5e_vxlan_work *vxlan_work =
+   container_of(work, struct mlx5e_vxlan_work, work);
+   struct mlx5e_priv *priv = vxlan_work->priv;
+   u16 port = vxlan_work->port;
+
+   mutex_lock(>state_lock);
+   mlx5e_vxlan_del_port(priv, port);
+   mutex_unlock(>state_lock);
+   kfree(vxlan_work);
+}
+
+static void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add)
+{
+   struct mlx5e_vxlan_work *vxlan_work;
+
+   vxlan_work = kmalloc(sizeof(*vxlan_work), GFP_ATOMIC);
+   if (!vxlan_work)
+   return;
+
+   if (add)
+   INIT_WORK(_work->work, mlx5e_vxlan_add_work);
+   else
+   INIT_WORK(_work->work, mlx5e_vxlan_del_work);
+
+   vxlan_work->priv = priv;
+   vxlan_work->port = port;
+   queue_work(priv->wq, _work->work);
+}
+
 static void mlx5e_add_vxlan_port(struct net_device *netdev,
 struct udp_tunnel_info *ti)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index baeac5922e8c..9a8ca532a443 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -36,7 +36,11 @@
 #include "mlx5_core.h"
 #include "vxlan.h"
 
-static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port);
+struct mlx5e_vxlan {
+   struct hlist_node hlist;
+   atomic_t refcount;
+   u16 udp_port;
+};
 
 void mlx5e_vxlan_init(struct mlx5e_priv *priv)
 {
@@ -105,7 +109,7 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct 
mlx5e_priv *priv, u16 port)
return vxlan;
 }
 
-static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port)
+void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
@@ -144,21 +148,7 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
 }
 
-static void mlx5e_vxlan_add_work(struct work_struct *work)
-{
-   struct mlx5e_vxlan_work *vxlan_work =
-   container_of(work, struct mlx5e_vxlan_work, work);
-   struct mlx5e_priv *priv = vxlan_work->priv;
-   u16 port = vxlan_work->port;
-
-   mutex_lock(>state_lock);
-   mlx5e_vxlan_add_port(priv, port);
-   mutex_unlock(>state_lock);
-
-   kfree(vxlan_work);
-}
-
-static void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port)
+void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
@@ -185,37 +175,6 @@ static void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, 
u16 port)
}
 }
 
-static void mlx5e_vxlan_del_work(struct work_struct *work)
-{
-   struct mlx5e_vxlan_work *vxlan_work =
-   container_of(work, struct mlx5e_vxlan_work, work);
-   struct mlx5e_priv *priv = vxlan_work->priv;
-   u16 port = vxlan_work->port;
-
-   mutex_lock(>state_lock);
-   mlx5e_vxlan_del_port(priv, port);
-   mutex_unlock(>state_lock);
-   kfree(vxlan_work);
-}
-
-void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add)
-{
-   struct mlx5e_vxlan_work *vxlan_work;
-
-   vxlan_work = kmalloc(sizeof(*vxlan_work), GFP_ATOMIC);
-   if (!vxlan_work)
-   return;
-

[net-next V2 09/12] net/mlx5e: Vxlan, return values for add/del port

2018-07-27 Thread Saeed Mahameed
mlx5_vxlan_{add/del}_port can fail; for a better API, make them return
error values.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 28 +--
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   |  4 +--
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 759260f52bdd..c9a50753ab23 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -104,29 +104,34 @@ struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct 
mlx5_vxlan *vxlan, u16 por
return vxlanp;
 }
 
-void mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port)
+int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port)
 {
struct mlx5_vxlan_port *vxlanp;
+   int ret = -ENOSPC;
 
vxlanp = mlx5_vxlan_lookup_port(vxlan, port);
if (vxlanp) {
atomic_inc(>refcount);
-   return;
+   return 0;
}
 
if (vxlan->num_ports >= mlx5_vxlan_max_udp_ports(vxlan->mdev)) {
mlx5_core_info(vxlan->mdev,
   "UDP port (%d) not offloaded, max number of UDP 
ports (%d) are already offloaded\n",
   port, mlx5_vxlan_max_udp_ports(vxlan->mdev));
-   return;
+   ret = -ENOSPC;
+   return ret;
}
 
-   if (mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port))
-   return;
+   ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port);
+   if (ret)
+   return ret;
 
vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL);
-   if (!vxlanp)
+   if (!vxlanp) {
+   ret = -ENOMEM;
goto err_delete_port;
+   }
 
vxlanp->udp_port = port;
atomic_set(>refcount, 1);
@@ -136,21 +141,25 @@ void mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 
port)
spin_unlock_bh(>lock);
 
vxlan->num_ports++;
-   return;
+   return 0;
 
 err_delete_port:
mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port);
+   return ret;
 }
 
-void mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
+int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
 {
struct mlx5_vxlan_port *vxlanp;
bool remove = false;
+   int ret = 0;
 
spin_lock_bh(>lock);
vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port);
-   if (!vxlanp)
+   if (!vxlanp) {
+   ret = -ENOENT;
goto out_unlock;
+   }
 
if (atomic_dec_and_test(>refcount)) {
hash_del(>hlist);
@@ -165,6 +174,7 @@ void mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
kfree(vxlanp);
vxlan->num_ports--;
}
+   return ret;
 }
 
 struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
index 9d6327321814..fd874a30c4d0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
@@ -49,8 +49,8 @@ static inline bool mlx5_vxlan_allowed(struct mlx5_vxlan 
*vxlan)
 
 struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev);
 void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan);
-void mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port);
-void mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port);
+int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port);
+int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port);
 struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 
port);
 
 #else
-- 
2.17.0



[net-next V2 12/12] net/mlx5e: Issue direct lookup on vxlan ports by vport representors

2018-07-27 Thread Saeed Mahameed
Remove uplink representor netdevice private structure lookup, and use
mlx5 core handle directly from representor private structure to lookup
vxlan ports.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 288a57f76e84..c28fe469b04a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1124,16 +1124,12 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv,
skb_flow_dissector_target(f->dissector,
  FLOW_DISSECTOR_KEY_ENC_PORTS,
  f->mask);
-   struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-   struct mlx5e_rep_priv *uplink_rpriv = 
mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
-   struct net_device *up_dev = uplink_rpriv->netdev;
-   struct mlx5e_priv *up_priv = netdev_priv(up_dev);
 
/* Full udp dst port must be given */
if (memchr_inv(>dst, 0xff, sizeof(mask->dst)))
goto vxlan_match_offload_err;
 
-   if (mlx5_vxlan_lookup_port(up_priv->mdev->vxlan, 
be16_to_cpu(key->dst)) &&
+   if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, 
be16_to_cpu(key->dst)) &&
MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap))
parse_vxlan_attr(spec, f);
else {
@@ -2533,11 +2529,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
  struct mlx5e_tc_flow *flow)
 {
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-   struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw,
-  
REP_ETH);
-   struct net_device *up_dev = uplink_rpriv->netdev;
unsigned short family = ip_tunnel_info_af(tun_info);
-   struct mlx5e_priv *up_priv = netdev_priv(up_dev);
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
struct ip_tunnel_key *key = _info->key;
struct mlx5e_encap_entry *e;
@@ -2557,7 +2549,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
return -EOPNOTSUPP;
}
 
-   if (mlx5_vxlan_lookup_port(up_priv->mdev->vxlan, 
be16_to_cpu(key->tp_dst)) &&
+   if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, be16_to_cpu(key->tp_dst)) 
&&
MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) {
tunnel_type = MLX5_HEADER_TYPE_VXLAN;
} else {
-- 
2.17.0



[net-next V2 11/12] net/mlx5e: Vxlan, move vxlan logic to core driver

2018-07-27 Thread Saeed Mahameed
Move vxlan logic and objects to the mlx5 core driver, since it is going
to be used from different mlx5 interfaces, e.g. the mlx5e PF NIC netdev
and mlx5e E-Switch representors.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/Makefile  |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  2 --
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 21 ---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   |  6 +++---
 .../mellanox/mlx5/core/{ => lib}/vxlan.c  |  0
 .../mellanox/mlx5/core/{ => lib}/vxlan.h  |  0
 .../net/ethernet/mellanox/mlx5/core/main.c|  5 +
 include/linux/mlx5/driver.h   |  2 ++
 8 files changed, 21 insertions(+), 19 deletions(-)
 rename drivers/net/ethernet/mellanox/mlx5/core/{ => lib}/vxlan.c (100%)
 rename drivers/net/ethernet/mellanox/mlx5/core/{ => lib}/vxlan.h (100%)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index ae2bdcb1647c..f20fda1ced4f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -14,8 +14,8 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o 
fpga/conn.o fpga/sdk.o \
fpga/ipsec.o fpga/tls.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o 
\
-   en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o vxlan.o \
-   en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o
+   en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
+   en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o lib/vxlan.o
 
 mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 1bd4536b9061..c7ed3d20fd54 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -52,7 +52,6 @@
 #include "wq.h"
 #include "mlx5_core.h"
 #include "en_stats.h"
-#include "vxlan.h"
 
 struct page_pool;
 
@@ -812,7 +811,6 @@ struct mlx5e_priv {
u32tx_rates[MLX5E_MAX_NUM_SQS];
 
struct mlx5e_flow_steering fs;
-   struct mlx5_vxlan  *vxlan;
 
struct workqueue_struct*wq;
struct work_struct update_carrier_work;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ef4b2f0c427c..fde35021a257 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -45,7 +45,7 @@
 #include "en_accel/tls.h"
 #include "accel/ipsec.h"
 #include "accel/tls.h"
-#include "vxlan.h"
+#include "lib/vxlan.h"
 #include "en/port.h"
 #include "en/xdp.h"
 
@@ -2974,7 +2974,7 @@ int mlx5e_open(struct net_device *netdev)
mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP);
mutex_unlock(>state_lock);
 
-   if (mlx5_vxlan_allowed(priv->vxlan))
+   if (mlx5_vxlan_allowed(priv->mdev->vxlan))
udp_tunnel_get_rx_info(netdev);
 
return err;
@@ -3983,7 +3983,7 @@ static void mlx5e_vxlan_add_work(struct work_struct *work)
u16 port = vxlan_work->port;
 
mutex_lock(>state_lock);
-   mlx5_vxlan_add_port(priv->vxlan, port);
+   mlx5_vxlan_add_port(priv->mdev->vxlan, port);
mutex_unlock(>state_lock);
 
kfree(vxlan_work);
@@ -3997,7 +3997,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
u16 port = vxlan_work->port;
 
mutex_lock(>state_lock);
-   mlx5_vxlan_del_port(priv->vxlan, port);
+   mlx5_vxlan_del_port(priv->mdev->vxlan, port);
mutex_unlock(>state_lock);
kfree(vxlan_work);
 }
@@ -4028,7 +4028,7 @@ static void mlx5e_add_vxlan_port(struct net_device 
*netdev,
if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
return;
 
-   if (!mlx5_vxlan_allowed(priv->vxlan))
+   if (!mlx5_vxlan_allowed(priv->mdev->vxlan))
return;
 
mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1);
@@ -4042,7 +4042,7 @@ static void mlx5e_del_vxlan_port(struct net_device 
*netdev,
if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
return;
 
-   if (!mlx5_vxlan_allowed(priv->vxlan))
+   if (!mlx5_vxlan_allowed(priv->mdev->vxlan))
return;
 
mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0);
@@ -4076,7 +4076,7 @@ static netdev_features_t 
mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
port = be16_to_cpu(udph->dest);
 
/* Verify if UDP port is being offloaded by HW */
-   if (mlx5_vxlan_lookup_port(priv->vxlan, port))
+   if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, port))
return features;
}
 
@@ -4648,7 +4648,7 @@ static void mlx5e_build_nic_netdev(struct net_device 
*netdev)

[net-next V2 08/12] net/mlx5e: Vxlan, rename from mlx5e to mlx5

2018-07-27 Thread Saeed Mahameed
Rename vxlan functions from mlx5e_vxlan_* to mlx5_vxlan_*.
Rename mlx5e_vxlan_db to mlx5_vxlan and move it from en.h to vxlan.c
since it is not related to mlx5e anymore.

Allocate mlx5_vxlan structure dynamically in order to make it easier to
move later to core driver and to make it private in vxlan.c.

This is in preparation to move vxlan API to mlx5 core.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  10 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  21 ++--
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   |   4 +-
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 118 ++
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   |  30 +++--
 5 files changed, 104 insertions(+), 79 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 6878925c3abf..1bd4536b9061 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -52,6 +52,7 @@
 #include "wq.h"
 #include "mlx5_core.h"
 #include "en_stats.h"
+#include "vxlan.h"
 
 struct page_pool;
 
@@ -654,13 +655,6 @@ enum {
MLX5E_STATE_DESTROYING,
 };
 
-struct mlx5e_vxlan_db {
-   spinlock_t  lock; /* protect vxlan table */
-   /* max_num_ports is usuallly 4, 16 buckets is more than enough */
-   DECLARE_HASHTABLE(htable, 4);
-   int num_ports;
-};
-
 struct mlx5e_l2_rule {
u8  addr[ETH_ALEN + 2];
struct mlx5_flow_handle *rule;
@@ -818,7 +812,7 @@ struct mlx5e_priv {
u32tx_rates[MLX5E_MAX_NUM_SQS];
 
struct mlx5e_flow_steering fs;
-   struct mlx5e_vxlan_db  vxlan;
+   struct mlx5_vxlan  *vxlan;
 
struct workqueue_struct*wq;
struct work_struct update_carrier_work;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7a6b78e3b5f7..ef4b2f0c427c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2974,7 +2974,7 @@ int mlx5e_open(struct net_device *netdev)
mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP);
mutex_unlock(&priv->state_lock);
 
-   if (mlx5e_vxlan_allowed(priv->mdev))
+   if (mlx5_vxlan_allowed(priv->vxlan))
udp_tunnel_get_rx_info(netdev);
 
return err;
@@ -3983,7 +3983,7 @@ static void mlx5e_vxlan_add_work(struct work_struct *work)
u16 port = vxlan_work->port;
 
mutex_lock(&priv->state_lock);
-   mlx5e_vxlan_add_port(priv, port);
+   mlx5_vxlan_add_port(priv->vxlan, port);
mutex_unlock(&priv->state_lock);
 
kfree(vxlan_work);
@@ -3997,7 +3997,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
u16 port = vxlan_work->port;
 
mutex_lock(&priv->state_lock);
-   mlx5e_vxlan_del_port(priv, port);
+   mlx5_vxlan_del_port(priv->vxlan, port);
mutex_unlock(&priv->state_lock);
kfree(vxlan_work);
 }
@@ -4028,7 +4028,7 @@ static void mlx5e_add_vxlan_port(struct net_device 
*netdev,
if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
return;
 
-   if (!mlx5e_vxlan_allowed(priv->mdev))
+   if (!mlx5_vxlan_allowed(priv->vxlan))
return;
 
mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1);
@@ -4042,7 +4042,7 @@ static void mlx5e_del_vxlan_port(struct net_device 
*netdev,
if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
return;
 
-   if (!mlx5e_vxlan_allowed(priv->mdev))
+   if (!mlx5_vxlan_allowed(priv->vxlan))
return;
 
mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0);
@@ -4076,7 +4076,7 @@ static netdev_features_t 
mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
port = be16_to_cpu(udph->dest);
 
/* Verify if UDP port is being offloaded by HW */
-   if (mlx5e_vxlan_lookup_port(priv, port))
+   if (mlx5_vxlan_lookup_port(priv->vxlan, port))
return features;
}
 
@@ -4648,7 +4648,7 @@ static void mlx5e_build_nic_netdev(struct net_device 
*netdev)
netdev->hw_features  |= NETIF_F_HW_VLAN_CTAG_FILTER;
netdev->hw_features  |= NETIF_F_HW_VLAN_STAG_TX;
 
-   if (mlx5e_vxlan_allowed(mdev) || MLX5_CAP_ETH(mdev, 
tunnel_stateless_gre)) {
+   if (mlx5_vxlan_allowed(priv->vxlan) || MLX5_CAP_ETH(mdev, 
tunnel_stateless_gre)) {
netdev->hw_enc_features |= NETIF_F_IP_CSUM;
netdev->hw_enc_features |= NETIF_F_IPV6_CSUM;
netdev->hw_enc_features |= NETIF_F_TSO;
@@ -4656,7 +4656,7 @@ static void mlx5e_build_nic_netdev(struct net_device 
*netdev)
netdev->hw_enc_features |= NETIF_F_GSO_PARTIAL;
}
 
-   if (mlx5e_vxlan_allowed(mdev)) {
+   if 

[net-next V2 05/12] net/mlx5e: Vxlan, add direct delete function

2018-07-27 Thread Saeed Mahameed
Add direct vxlan delete function to be called from vxlan_delete_work.
Needed in downstream patch.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 4b9190d677fc..baeac5922e8c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -158,18 +158,14 @@ static void mlx5e_vxlan_add_work(struct work_struct *work)
kfree(vxlan_work);
 }
 
-static void mlx5e_vxlan_del_work(struct work_struct *work)
+static void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port)
 {
-   struct mlx5e_vxlan_work *vxlan_work =
-   container_of(work, struct mlx5e_vxlan_work, work);
-   struct mlx5e_priv *priv = vxlan_work->priv;
struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
-   u16 port = vxlan_work->port;
struct mlx5e_vxlan *vxlan;
bool remove = false;
 
-   mutex_lock(>state_lock);
spin_lock_bh(&vxlan_db->lock);
+
vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
if (!vxlan)
goto out_unlock;
@@ -187,6 +183,17 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
kfree(vxlan);
vxlan_db->num_ports--;
}
+}
+
+static void mlx5e_vxlan_del_work(struct work_struct *work)
+{
+   struct mlx5e_vxlan_work *vxlan_work =
+   container_of(work, struct mlx5e_vxlan_work, work);
+   struct mlx5e_priv *priv = vxlan_work->priv;
+   u16 port = vxlan_work->port;
+
+   mutex_lock(&priv->state_lock);
+   mlx5e_vxlan_del_port(priv, port);
mutex_unlock(&priv->state_lock);
kfree(vxlan_work);
 }
-- 
2.17.0



[net-next V2 03/12] net/mlx5e: Vxlan, replace ports radix-tree with hash table

2018-07-27 Thread Saeed Mahameed
From: Gal Pressman 

The VXLAN database is accessed in the data path for each VXLAN TX skb in
order to check whether the UDP port is being offloaded or not.
The number of elements in the database is relatively small, we can
simplify the radix-tree to a hash table and speedup the lookup process.

Measuring mlx5e_vxlan_lookup_port execution time:

  Radix Tree   Hash Table
 ---  
  Single Stream   161 ns   79  ns (51% improvement)
  Multi Stream259 ns   136 ns (47% improvement)

Measuring UDP stream packet rate, single fully utilized TX core:
Radix Tree: 498,300 PPS
Hash Table: 555,468 PPS (11% improvement)

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  3 +-
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 41 +++
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   |  1 +
 3 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c4d4db8722f5..6878925c3abf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -656,7 +656,8 @@ enum {
 
 struct mlx5e_vxlan_db {
spinlock_t  lock; /* protect vxlan table */
-   struct radix_tree_root  tree;
+   /* max_num_ports is usuallly 4, 16 buckets is more than enough */
+   DECLARE_HASHTABLE(htable, 4);
int num_ports;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index e3af2efe18ce..3c0ea9bc20e3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -43,7 +43,7 @@ void mlx5e_vxlan_init(struct mlx5e_priv *priv)
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
 
spin_lock_init(_db->lock);
-   INIT_RADIX_TREE(_db->tree, GFP_ATOMIC);
+   hash_init(vxlan_db->htable);
 
if (mlx5e_vxlan_allowed(priv->mdev))
/* Hardware adds 4789 by default.
@@ -79,13 +79,27 @@ static int mlx5e_vxlan_core_del_port_cmd(struct 
mlx5_core_dev *mdev, u16 port)
return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
 }
 
+static struct mlx5e_vxlan *mlx5e_vxlan_lookup_port_locked(struct mlx5e_priv 
*priv,
+ u16 port)
+{
+   struct mlx5e_vxlan_db *vxlan_db = >vxlan;
+   struct mlx5e_vxlan*vxlan;
+
+   hash_for_each_possible(vxlan_db->htable, vxlan, hlist, port) {
+   if (vxlan->udp_port == port)
+   return vxlan;
+   }
+
+   return NULL;
+}
+
 struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
 
spin_lock_bh(_db->lock);
-   vxlan = radix_tree_lookup(_db->tree, port);
+   vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
spin_unlock_bh(_db->lock);
 
return vxlan;
@@ -95,7 +109,6 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
-   int err;
 
vxlan = mlx5e_vxlan_lookup_port(priv, port);
if (vxlan) {
@@ -121,16 +134,12 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
atomic_set(>refcount, 1);
 
spin_lock_bh(_db->lock);
-   err = radix_tree_insert(_db->tree, vxlan->udp_port, vxlan);
+   hash_add(vxlan_db->htable, >hlist, port);
spin_unlock_bh(_db->lock);
-   if (err)
-   goto err_free;
 
vxlan_db->num_ports++;
return;
 
-err_free:
-   kfree(vxlan);
 err_delete_port:
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
 }
@@ -161,12 +170,12 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
 
mutex_lock(>state_lock);
spin_lock_bh(_db->lock);
-   vxlan = radix_tree_lookup(_db->tree, port);
+   vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
if (!vxlan)
goto out_unlock;
 
if (atomic_dec_and_test(>refcount)) {
-   radix_tree_delete(_db->tree, port);
+   hash_del(>hlist);
remove = true;
}
 
@@ -206,13 +215,13 @@ void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
-   unsigned int port = 0;
+   struct hlist_node *tmp;
+   int bkt;
 
-   /* Lockless since we are the only radix-tree consumers, wq is disabled 
*/
-   while (radix_tree_gang_lookup(_db->tree, (void **), port, 
1)) {
-   port = vxlan->udp_port;
-   radix_tree_delete(_db->tree, port);
-   mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
+   /* 

[net-next V2 02/12] net/mlx5e: Vxlan, check maximum number of UDP ports

2018-07-27 Thread Saeed Mahameed
From: Gal Pressman 

The NIC has a limited number of offloaded VXLAN UDP ports (usually 4).
Instead of letting the firmware fail when trying to add more ports than
it can handle, let the driver check it on its own.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h|  1 +
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c | 14 ++
 include/linux/mlx5/mlx5_ifc.h   |  4 +++-
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c41cfc2a4b70..c4d4db8722f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -657,6 +657,7 @@ enum {
 struct mlx5e_vxlan_db {
spinlock_t  lock; /* protect vxlan table */
struct radix_tree_root  tree;
+   int num_ports;
 };
 
 struct mlx5e_l2_rule {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 2f68d13e..e3af2efe18ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -52,6 +52,11 @@ void mlx5e_vxlan_init(struct mlx5e_priv *priv)
mlx5e_vxlan_add_port(priv, 4789);
 }
 
+static inline u8 mlx5e_vxlan_max_udp_ports(struct mlx5_core_dev *mdev)
+{
+   return MLX5_CAP_ETH(mdev, max_vxlan_udp_ports) ?: 4;
+}
+
 static int mlx5e_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port)
 {
u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)]   = {0};
@@ -98,6 +103,13 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
return;
}
 
+   if (vxlan_db->num_ports >= mlx5e_vxlan_max_udp_ports(priv->mdev)) {
+   netdev_info(priv->netdev,
+   "UDP port (%d) not offloaded, max number of UDP 
ports (%d) are already offloaded\n",
+   port, mlx5e_vxlan_max_udp_ports(priv->mdev));
+   return;
+   }
+
if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
return;
 
@@ -114,6 +126,7 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
if (err)
goto err_free;
 
+   vxlan_db->num_ports++;
return;
 
 err_free:
@@ -163,6 +176,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
if (remove) {
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
kfree(vxlan);
+   vxlan_db->num_ports--;
}
mutex_unlock(>state_lock);
kfree(vxlan_work);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 22f54bedfaae..60c2308fe062 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -668,7 +668,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
u8 swp[0x1];
u8 swp_csum[0x1];
u8 swp_lso[0x1];
-   u8 reserved_at_23[0x1b];
+   u8 reserved_at_23[0xd];
+   u8 max_vxlan_udp_ports[0x8];
+   u8 reserved_at_38[0x6];
u8 max_geneve_opt_len[0x1];
u8 tunnel_stateless_geneve_rx[0x1];
 
-- 
2.17.0



[net-next V2 01/12] net/mlx5e: Vxlan, reflect 4789 UDP port default addition to software database

2018-07-27 Thread Saeed Mahameed
From: Gal Pressman 

The hardware offloads 4789 UDP port (default VXLAN port) automatically.
Add it to the software database as well in order to reflect the hardware
state appropriately.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 40 +--
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 2f74953e4561..2f68d13e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -36,12 +36,20 @@
 #include "mlx5_core.h"
 #include "vxlan.h"
 
+static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port);
+
 void mlx5e_vxlan_init(struct mlx5e_priv *priv)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
 
spin_lock_init(_db->lock);
INIT_RADIX_TREE(_db->tree, GFP_ATOMIC);
+
+   if (mlx5e_vxlan_allowed(priv->mdev))
+   /* Hardware adds 4789 by default.
+* Lockless since we are the only hash table consumers, wq and 
TX are disabled.
+*/
+   mlx5e_vxlan_add_port(priv, 4789);
 }
 
 static int mlx5e_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port)
@@ -78,25 +86,20 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct 
mlx5e_priv *priv, u16 port)
return vxlan;
 }
 
-static void mlx5e_vxlan_add_port(struct work_struct *work)
+static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port)
 {
-   struct mlx5e_vxlan_work *vxlan_work =
-   container_of(work, struct mlx5e_vxlan_work, work);
-   struct mlx5e_priv *priv = vxlan_work->priv;
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   u16 port = vxlan_work->port;
struct mlx5e_vxlan *vxlan;
int err;
 
-   mutex_lock(>state_lock);
vxlan = mlx5e_vxlan_lookup_port(priv, port);
if (vxlan) {
atomic_inc(>refcount);
-   goto free_work;
+   return;
}
 
if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
-   goto free_work;
+   return;
 
vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL);
if (!vxlan)
@@ -111,18 +114,29 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
if (err)
goto err_free;
 
-   goto free_work;
+   return;
 
 err_free:
kfree(vxlan);
 err_delete_port:
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
-free_work:
+}
+
+static void mlx5e_vxlan_add_work(struct work_struct *work)
+{
+   struct mlx5e_vxlan_work *vxlan_work =
+   container_of(work, struct mlx5e_vxlan_work, work);
+   struct mlx5e_priv *priv = vxlan_work->priv;
+   u16 port = vxlan_work->port;
+
+   mutex_lock(>state_lock);
+   mlx5e_vxlan_add_port(priv, port);
mutex_unlock(>state_lock);
+
kfree(vxlan_work);
 }
 
-static void mlx5e_vxlan_del_port(struct work_struct *work)
+static void mlx5e_vxlan_del_work(struct work_struct *work)
 {
struct mlx5e_vxlan_work *vxlan_work =
container_of(work, struct mlx5e_vxlan_work, work);
@@ -164,9 +178,9 @@ void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, 
sa_family_t sa_family,
return;
 
if (add)
-   INIT_WORK(_work->work, mlx5e_vxlan_add_port);
+   INIT_WORK(_work->work, mlx5e_vxlan_add_work);
else
-   INIT_WORK(_work->work, mlx5e_vxlan_del_port);
+   INIT_WORK(_work->work, mlx5e_vxlan_del_work);
 
vxlan_work->priv = priv;
vxlan_work->port = port;
-- 
2.17.0



[pull request][net-next V2 00/12] Mellanox, mlx5 updates 2018-07-27 (Vxlan updates)

2018-07-27 Thread Saeed Mahameed
Hi Dave,

This series from Gal and Saeed provides updates to mlx5 vxlan implementation.

For more information please see tag log below.

Please pull and let me know if there's any problem.

V1->V2:
 - Drop the rw lock patch.

Thanks,
Saeed.

--- 

The following changes since commit 1f3ed383fb9a073ae2e408cd7a0717b04c7c3a21:

  net: sched: don't dump chains only held by actions (2018-07-27 09:38:46 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5e-updates-2018-07-27

for you to fetch changes up to a3e673660bc3fca3e9e0cbab871b2fb100e9ed64:

  net/mlx5e: Issue direct lookup on vxlan ports by vport representors 
(2018-07-27 15:46:13 -0700)


mlx5e-updates-2018-07-27 (Vxlan updates)

This series from Gal and Saeed provides updates to mlx5 vxlan implementation.

Gal started with three cleanups to reflect the actual hardware vxlan state
- reflect 4789 UDP port default addition to software database
- check maximum number of vxlan  UDP ports
- cleanup an unused member in vxlan work

Then Gal provides performance optimization by replacing the
vxlan radix tree with a hash table.

Measuring mlx5e_vxlan_lookup_port execution time:

  Radix Tree   Hash Table
 ---  
  Single Stream   161 ns   79  ns (51% improvement)
  Multi Stream259 ns   136 ns (47% improvement)

Measuring UDP stream packet rate, single fully utilized TX core:
Radix Tree: 498,300 PPS
Hash Table: 555,468 PPS (11% improvement)

Next, from Saeed, vxlan refactoring to allow sharing the vxlan table
between different mlx5 netdevice instances like PF and VF representors,
this is done by making mlx5 vxlan interface more generic and decoupling
it from PF netdevice structures and logic, then moving it into mlx5 core
as a low level interface so it can be used by VF representors, which is
illustrated in the last patch of the series.

-Saeed.


Gal Pressman (4):
  net/mlx5e: Vxlan, reflect 4789 UDP port default addition to software 
database
  net/mlx5e: Vxlan, check maximum number of UDP ports
  net/mlx5e: Vxlan, replace ports radix-tree with hash table
  net/mlx5e: Vxlan, cleanup an unused member in vxlan work

Saeed Mahameed (8):
  net/mlx5e: Vxlan, add direct delete function
  net/mlx5e: Vxlan, move netdev only logic to en_main.c
  net/mlx5e: Vxlan, rename struct mlx5e_vxlan to mlx5_vxlan_port
  net/mlx5e: Vxlan, rename from mlx5e to mlx5
  net/mlx5e: Vxlan, return values for add/del port
  net/mlx5e: Vxlan, add sync lock for add/del vxlan port
  net/mlx5e: Vxlan, move vxlan logic to core driver
  net/mlx5e: Issue direct lookup on vxlan ports by vport representors

 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |   6 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  71 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c|  14 +-
 .../net/ethernet/mellanox/mlx5/core/lib/vxlan.c| 230 +
 .../ethernet/mellanox/mlx5/core/{ => lib}/vxlan.h  |  39 ++--
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c| 190 -
 include/linux/mlx5/driver.h|   2 +
 include/linux/mlx5/mlx5_ifc.h  |   4 +-
 10 files changed, 325 insertions(+), 240 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
 rename drivers/net/ethernet/mellanox/mlx5/core/{ => lib}/vxlan.h (66%)
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c


Re: [**EXTERNAL**] Re: VRF with enslaved L3 enabled bridge

2018-07-27 Thread D'Souza, Nelson
David,

With Ubuntu 18.04.1 (kernel 4.15.0-29) pings sent out on test-vrf and br0 are 
successful.

# uname -rv
4.15.0-29-generic #31-Ubuntu SMP Tue Jul 17 15:39:52 UTC 2018

# ping -c 1 -I test-vrf 172.16.2.2
ping: Warning: source address might be selected on device other than test-vrf.
PING 172.16.2.2 (172.16.2.2) from 172.16.1.1 test-vrf: 56(84) bytes of data.
64 bytes from 172.16.2.2: icmp_seq=1 ttl=64 time=0.050 ms

--- 172.16.2.2 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.050/0.050/0.050/0.000 ms

# ping -c 1 -I br0 172.16.2.2
PING 172.16.2.2 (172.16.2.2) from 172.16.1.1 br0: 56(84) bytes of data.
64 bytes from 172.16.2.2: icmp_seq=1 ttl=64 time=0.026 ms

--- 172.16.2.2 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.026/0.026/0.026/0.000 ms

However, with Ubuntu 17.10.1 (kernel  4.13.0-21) pings on only test-vrf are 
successful. Pings on br0 are not successful.
So it seems like there may be a change in versions after 4.13.0-21 that causes 
pings on br0 to pass.

Nelson

On 7/25/18, 5:35 PM, "D'Souza, Nelson"  wrote:

David, 

I tried out the commands on an Ubuntu 17.10.1 VM.
The pings on test-vrf are successful, but the pings on br0 are not 
successful.

# uname -rv  
4.13.0-21-generic #24-Ubuntu SMP Mon Dec 18 17:29:16 UTC 2017

 # lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description:Ubuntu 17.10
Release:17.10
Codename:   artful

# ip rule  --> Note: it's missing the l3mdev rule
0:  from all lookup local 
32766:  from all lookup main 
32767:  from all lookup default

Ran the configs from a bash script vrf.sh

 # ./vrf.sh 
+ ip netns add foo
+ ip li add veth1 type veth peer name veth2
+ ip li set veth2 netns foo
+ ip -netns foo li set lo up
+ ip -netns foo li set veth2 up
+ ip -netns foo addr add 172.16.1.2/24 dev veth2
+ ip li add test-vrf type vrf table 123
+ ip li set test-vrf up
+ ip ro add vrf test-vrf unreachable default
+ ip li add br0 type bridge
+ ip li set veth1 master br0
+ ip li set veth1 up
+ ip li set br0 up
+ ip addr add dev br0 172.16.1.1/24
+ ip li set br0 master test-vrf
+ ip -netns foo addr add 172.16.2.2/32 dev lo
+ ip ro add vrf test-vrf 172.16.2.2/32 via 172.16.1.2

# ping -I test-vrf 172.16.2.2 -c 2  <<< successful on test-vrf
ping: Warning: source address might be selected on device other than 
test-vrf.
PING 172.16.2.2 (172.16.2.2) from 172.16.1.1 test-vrf: 56(84) bytes of data.
64 bytes from 172.16.2.2: icmp_seq=1 ttl=64 time=0.035 ms
64 bytes from 172.16.2.2: icmp_seq=2 ttl=64 time=0.045 ms

--- 172.16.2.2 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1022ms
rtt min/avg/max/mdev = 0.035/0.040/0.045/0.005 ms

#ping -I br0 172.16.2.2 -c 2   <<< fails on br0
PING 172.16.2.2 (172.16.2.2) from 172.16.1.1 br0: 56(84) bytes of data.

--- 172.16.2.2 ping statistics ---
2 packets transmitted, 0 received, 100% packet loss, time 1022ms

Please let me know if I should try a different version.

Nelson

On 7/24/18, 9:08 AM, "D'Souza, Nelson"  wrote:

It's strange that enslaving eth1 -> br0 -> test-vrf does not work, but 
enslaving eth1->test-vrf works fine.

Nelson

On 7/24/18, 8:58 AM, "D'Souza, Nelson"  wrote:

Thank you David, really appreciate the help. Most likely something 
specific to my environment.

ip vrf id, does not report anything on my system. Here's the result 
after running the command.

# ip vrf id
#

I'll follow up with a VM.

Nelson

On 7/24/18, 5:55 AM, "David Ahern"  wrote:

On 7/23/18 7:43 PM, D'Souza, Nelson wrote:
> I copy and pasted the configs onto my device, but pings on 
test-vrf do not work in my setup. 
> I'm essentially seeing the same issue as I reported before.
> 
> In this case, pings sent out on test-vrf (host ns) are 
received and replied to by the loopback interface (foo ns). Although the 
replies are seen at the test-vrf level, they are not locally delivered to the 
ping application.
> 

I just built v4.14.52 kernel and ran those commands - worked 
fine. It is
something specific to your environment. Is your shell tied to a 
VRF --
(ip vrf id)?

After that, I suggest you create a VM running a newer 
distribution of
your choice (Ubuntu 17.10 or newer, debian stretch with 4.14 
kernel, or
   

[PATCH bpf] tools/bpftool: fix a percpu_array map dump problem

2018-07-27 Thread Yonghong Song
I hit the following problem when I tried to use bpftool
to dump a percpu array.

  $ sudo ./bpftool map show
  61: percpu_array  name stub  flags 0x0
  key 4B  value 4B  max_entries 1  memlock 4096B
  ...
  $ sudo ./bpftool map dump id 61
  bpftool: malloc.c:2406: sysmalloc: Assertion
  `(old_top == initial_top (av) && old_size == 0) || \
   ((unsigned long) (old_size) >= MINSIZE && \
   prev_inuse (old_top) && \
   ((unsigned long) old_end & (pagesize - 1)) == 0)'
  failed.
  Aborted

Further debugging revealed that this is due to
miscommunication between bpftool and kernel.
For example, for the above percpu_array with a value size of 4B,
the map info returned to user space has a value size of 4B.

In bpftool, the values array for lookup is allocated like:
   info->value_size * get_possible_cpus() = 4 * get_possible_cpus()
In kernel (kernel/bpf/syscall.c), the values array size is
rounded up to multiple of 8.
   round_up(map->value_size, 8) * num_possible_cpus()
   = 8 * num_possible_cpus()
So when kernel copies the values to user buffer, the kernel will
overwrite beyond user buffer boundary.

This patch fixed the issue by allocating and stepping through
percpu map value array properly in bpftool.

Fixes: 71bb428fe2c19 ("tools: bpf: add bpftool")
Signed-off-by: Yonghong Song 
---
 tools/bpf/bpftool/map.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 0ee3ba479d87..92bc55f98c4c 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -91,7 +92,8 @@ static bool map_is_map_of_progs(__u32 type)
 static void *alloc_value(struct bpf_map_info *info)
 {
if (map_is_per_cpu(info->type))
-   return malloc(info->value_size * get_possible_cpus());
+   return malloc(round_up(info->value_size, 8) *
+ get_possible_cpus());
else
return malloc(info->value_size);
 }
@@ -273,9 +275,10 @@ static void print_entry_json(struct bpf_map_info *info, 
unsigned char *key,
do_dump_btf(, info, key, value);
}
} else {
-   unsigned int i, n;
+   unsigned int i, n, step;
 
n = get_possible_cpus();
+   step = round_up(info->value_size, 8);
 
jsonw_name(json_wtr, "key");
print_hex_data_json(key, info->key_size);
@@ -288,7 +291,7 @@ static void print_entry_json(struct bpf_map_info *info, 
unsigned char *key,
jsonw_int_field(json_wtr, "cpu", i);
 
jsonw_name(json_wtr, "value");
-   print_hex_data_json(value + i * info->value_size,
+   print_hex_data_json(value + i * step,
info->value_size);
 
jsonw_end_object(json_wtr);
-- 
2.14.3



[PATCH net-next] selftests: mlxsw: qos_dscp_bridge: Fix

2018-07-27 Thread Petr Machata
There are two problems in this test case:

- When indexing in bash associative array, the subscript is interpreted as
  string, not as a variable name to be expanded.

- The keys stored to t0s and t1s are not DSCP values, but priority +
  base (i.e. the logical DSCP value, not the full bitfield value).

In combination these two bugs conspire to make the test just work,
except it doesn't really test anything and always passes.

Fix the above two problems in the obvious manner.

Signed-off-by: Petr Machata 
---
 tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh 
b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh
index 418319f19108..cc527660a022 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh
@@ -217,13 +217,13 @@ dscp_ping_test()
 
for key in ${!t0s[@]}; do
local expect
-   if ((key == dscp_10 || key == dscp_20)); then
+   if ((key == prio+10 || key == prio+20)); then
expect=10
else
expect=0
fi
 
-   local delta=$((t1s[key] - t0s[key]))
+   local delta=$((t1s[$key] - t0s[$key]))
((expect == delta))
check_err $? "DSCP $key: Expected to capture $expect packets, 
got $delta."
done
-- 
2.4.11



Re: [net-next 04/13] net/mlx5e: Vxlan, replace spinlock with read-write lock

2018-07-27 Thread Saeed Mahameed
On Fri, Jul 27, 2018 at 2:48 PM, Stephen Hemminger
 wrote:
> On Fri, 27 Jul 2018 14:15:09 -0700
> Saeed Mahameed  wrote:
>
>> From: Gal Pressman 
>>
>> The VXLAN database is mainly used by readers in data path, and rarely
>> used by control path writers.
>> Multiple readers (threads) should not block each other and cause an
>> unnecessary contention on the lock.
>>
>> Replacing the spinlock with rwlock optimizes the common use case where
>> adding ports to the table (adding VXLAN interfaces) is quite rare, but
>> the table is accessed for each VXLAN TX skb.
>>
>> Signed-off-by: Gal Pressman 
>> Signed-off-by: Saeed Mahameed 
>
> Did you know that for small sections a spinlock is significantly faster than
> a reader-writer lock? It turns out that with reader-writer locks the reader
> creates a cache line bounce.
>
> https://www.kernel.org/doc/Documentation/locking/spinlocks.txt
>
>
> Lesson 2: reader-writer spinlocks.
>
> If your data accesses have a very natural pattern where you usually tend
> to mostly read from the shared variables, the reader-writer locks
> (rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
> readers to be in the same critical region at once, but if somebody wants
> to change the variables it has to get an exclusive write lock.
>
>NOTE! reader-writer locks require more atomic memory operations than
>simple spinlocks.  Unless the reader critical section is long, you
>are better off just using spinlocks.

Thanks Stephen, very useful information!
I will drop this patch for now; I will consider using an RCU lock
instead in a future patch.


[PATCH iproute2-next] sch_cake: Make gso-splitting configurable

2018-07-27 Thread Dave Taht
This patch makes sch_cake's gso/gro splitting configurable
from userspace.

To disable breaking apart superpackets in sch_cake:

tc qdisc replace dev whatever root cake no-split-gso

to enable:

tc qdisc replace dev whatever root cake split-gso

Signed-off-by: Toke Høiland-Jørgensen 
Signed-off-by: Dave Taht 

---
 tc/q_cake.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/tc/q_cake.c b/tc/q_cake.c
index f1e232a..727d673 100644
--- a/tc/q_cake.c
+++ b/tc/q_cake.c
@@ -79,6 +79,7 @@ static void explain(void)
 "  dual-srchost | dual-dsthost | triple-isolate* ]\n"
 "[ nat | nonat* ]\n"
 "[ wash | nowash* ]\n"
+"[ split-gso* | no-split-gso ]\n"
 "[ ack-filter | ack-filter-aggressive | no-ack-filter* ]\n"
 "[ memlimit LIMIT ]\n"
 "[ ptm | atm | noatm* ] [ overhead N | conservative | raw* ]\n"
@@ -108,6 +109,7 @@ static int cake_parse_opt(struct qdisc_util *qu, int argc, 
char **argv,
int nat = -1;
int atm = -1;
int mpu = 0;
+   int split_gso = -1;
 
while (argc > 0) {
if (strcmp(*argv, "bandwidth") == 0) {
@@ -155,6 +157,10 @@ static int cake_parse_opt(struct qdisc_util *qu, int argc, 
char **argv,
wash = 0;
} else if (strcmp(*argv, "wash") == 0) {
wash = 1;
+   } else if (strcmp(*argv, "split-gso") == 0) {
+   split_gso = 1;
+   } else if (strcmp(*argv, "no-split-gso") == 0) {
+   split_gso = 0;
} else if (strcmp(*argv, "flowblind") == 0) {
flowmode = CAKE_FLOW_NONE;
} else if (strcmp(*argv, "srchost") == 0) {
@@ -374,6 +380,9 @@ static int cake_parse_opt(struct qdisc_util *qu, int argc, 
char **argv,
addattr_l(n, 1024, TCA_CAKE_NAT, , sizeof(nat));
if (wash != -1)
addattr_l(n, 1024, TCA_CAKE_WASH, , sizeof(wash));
+   if (split_gso != -1)
+   addattr_l(n, 1024, TCA_CAKE_SPLIT_GSO, _gso,
+ sizeof(split_gso));
if (ingress != -1)
addattr_l(n, 1024, TCA_CAKE_INGRESS, , sizeof(ingress));
if (ack_filter != -1)
-- 
2.7.4



Re: [net-next 04/13] net/mlx5e: Vxlan, replace spinlock with read-write lock

2018-07-27 Thread Stephen Hemminger
On Fri, 27 Jul 2018 14:15:09 -0700
Saeed Mahameed  wrote:

> From: Gal Pressman 
> 
> The VXLAN database is mainly used by readers in data path, and rarely
> used by control path writers.
> Multiple readers (threads) should not block each other and cause an
> unnecessary contention on the lock.
> 
> Replacing the spinlock with rwlock optimizes the common use case where
> adding ports to the table (adding VXLAN interfaces) is quite rare, but
> the table is accessed for each VXLAN TX skb.
> 
> Signed-off-by: Gal Pressman 
> Signed-off-by: Saeed Mahameed 

Did you know that for small sections a spinlock is significantly faster than
a reader-writer lock? It turns out that with reader-writer locks, the reader
creates a cache line bounce.

https://www.kernel.org/doc/Documentation/locking/spinlocks.txt


Lesson 2: reader-writer spinlocks.

If your data accesses have a very natural pattern where you usually tend
to mostly read from the shared variables, the reader-writer locks
(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
readers to be in the same critical region at once, but if somebody wants
to change the variables it has to get an exclusive write lock.

   NOTE! reader-writer locks require more atomic memory operations than
   simple spinlocks.  Unless the reader critical section is long, you
   are better off just using spinlocks.


[PATCH net] tcp_bbr: fix bw probing to raise in-flight data for very small BDPs

2018-07-27 Thread Neal Cardwell
For some very small BDPs (with just a few packets) there was a
quantization effect where the target number of packets in flight
during the super-unity-gain (1.25x) phase of gain cycling was
implicitly truncated to a number of packets no larger than the normal
unity-gain (1.0x) phase of gain cycling. This meant that in multi-flow
scenarios some flows could get stuck with a lower bandwidth, because
they did not push enough packets inflight to discover that there was
more bandwidth available. This was really only an issue in multi-flow
LAN scenarios, where RTTs and BDPs are low enough for this to be an
issue.

This fix ensures that gain cycling can raise inflight for small BDPs
by ensuring that in PROBE_BW mode target inflight values with a
super-unity gain are always greater than inflight values with a gain
<= 1. Importantly, this applies whether the inflight value is
calculated for use as a cwnd value, or as a target inflight value for
the end of the super-unity phase in bbr_is_next_cycle_phase() (both
need to be bigger to ensure we can probe with more packets in flight
reliably).

This is a candidate fix for stable releases.

Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
Signed-off-by: Neal Cardwell 
Acked-by: Yuchung Cheng 
Acked-by: Soheil Hassas Yeganeh 
Acked-by: Priyaranjan Jha 
Reviewed-by: Eric Dumazet 
---
 net/ipv4/tcp_bbr.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 3b5f45b9e81eb..13d34427ca3dd 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -358,6 +358,10 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int 
gain)
/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
cwnd = (cwnd + 1) & ~1U;
 
+   /* Ensure gain cycling gets inflight above BDP even for small BDPs. */
+   if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
+   cwnd += 2;
+
return cwnd;
 }
 
-- 
2.18.0.345.g5c9ce644c3-goog



[net-next 10/13] net/mlx5e: Vxlan, return values for add/del port

2018-07-27 Thread Saeed Mahameed
For a better API mlx5_vxlan_{add/del}_port can fail, make them return
error values.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 28 +--
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   |  4 +--
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 502af3bdf088..f5353134542d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -104,29 +104,34 @@ struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct 
mlx5_vxlan *vxlan, u16 por
return vxlanp;
 }
 
-void mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port)
+int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port)
 {
struct mlx5_vxlan_port *vxlanp;
+   int ret = -ENOSPC;
 
vxlanp = mlx5_vxlan_lookup_port(vxlan, port);
if (vxlanp) {
atomic_inc(>refcount);
-   return;
+   return 0;
}
 
if (vxlan->num_ports >= mlx5_vxlan_max_udp_ports(vxlan->mdev)) {
mlx5_core_info(vxlan->mdev,
   "UDP port (%d) not offloaded, max number of UDP 
ports (%d) are already offloaded\n",
   port, mlx5_vxlan_max_udp_ports(vxlan->mdev));
-   return;
+   ret = -ENOSPC;
+   return ret;
}
 
-   if (mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port))
-   return;
+   ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port);
+   if (ret)
+   return ret;
 
vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL);
-   if (!vxlanp)
+   if (!vxlanp) {
+   ret = -ENOMEM;
goto err_delete_port;
+   }
 
vxlanp->udp_port = port;
atomic_set(>refcount, 1);
@@ -136,21 +141,25 @@ void mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 
port)
write_unlock_bh(>lock);
 
vxlan->num_ports++;
-   return;
+   return 0;
 
 err_delete_port:
mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port);
+   return ret;
 }
 
-void mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
+int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
 {
struct mlx5_vxlan_port *vxlanp;
bool remove = false;
+   int ret = 0;
 
write_lock_bh(>lock);
vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port);
-   if (!vxlanp)
+   if (!vxlanp) {
+   ret = -ENOENT;
goto out_unlock;
+   }
 
if (atomic_dec_and_test(>refcount)) {
hash_del(>hlist);
@@ -165,6 +174,7 @@ void mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
kfree(vxlanp);
vxlan->num_ports--;
}
+   return ret;
 }
 
 struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
index 9d6327321814..fd874a30c4d0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
@@ -49,8 +49,8 @@ static inline bool mlx5_vxlan_allowed(struct mlx5_vxlan 
*vxlan)
 
 struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev);
 void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan);
-void mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port);
-void mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port);
+int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port);
+int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port);
 struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 
port);
 
 #else
-- 
2.17.0



[net-next 02/13] net/mlx5e: Vxlan, check maximum number of UDP ports

2018-07-27 Thread Saeed Mahameed
From: Gal Pressman 

The NIC has a limited number of offloaded VXLAN UDP ports (usually 4).
Instead of letting the firmware fail when trying to add more ports than
it can handle, let the driver check it on its own.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h|  1 +
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c | 14 ++
 include/linux/mlx5/mlx5_ifc.h   |  4 +++-
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c41cfc2a4b70..c4d4db8722f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -657,6 +657,7 @@ enum {
 struct mlx5e_vxlan_db {
spinlock_t  lock; /* protect vxlan table */
struct radix_tree_root  tree;
+   int num_ports;
 };
 
 struct mlx5e_l2_rule {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 2f68d13e..e3af2efe18ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -52,6 +52,11 @@ void mlx5e_vxlan_init(struct mlx5e_priv *priv)
mlx5e_vxlan_add_port(priv, 4789);
 }
 
+static inline u8 mlx5e_vxlan_max_udp_ports(struct mlx5_core_dev *mdev)
+{
+   return MLX5_CAP_ETH(mdev, max_vxlan_udp_ports) ?: 4;
+}
+
 static int mlx5e_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port)
 {
u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)]   = {0};
@@ -98,6 +103,13 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
return;
}
 
+   if (vxlan_db->num_ports >= mlx5e_vxlan_max_udp_ports(priv->mdev)) {
+   netdev_info(priv->netdev,
+   "UDP port (%d) not offloaded, max number of UDP 
ports (%d) are already offloaded\n",
+   port, mlx5e_vxlan_max_udp_ports(priv->mdev));
+   return;
+   }
+
if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
return;
 
@@ -114,6 +126,7 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
if (err)
goto err_free;
 
+   vxlan_db->num_ports++;
return;
 
 err_free:
@@ -163,6 +176,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
if (remove) {
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
kfree(vxlan);
+   vxlan_db->num_ports--;
}
mutex_unlock(>state_lock);
kfree(vxlan_work);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 22f54bedfaae..60c2308fe062 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -668,7 +668,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
u8 swp[0x1];
u8 swp_csum[0x1];
u8 swp_lso[0x1];
-   u8 reserved_at_23[0x1b];
+   u8 reserved_at_23[0xd];
+   u8 max_vxlan_udp_ports[0x8];
+   u8 reserved_at_38[0x6];
u8 max_geneve_opt_len[0x1];
u8 tunnel_stateless_geneve_rx[0x1];
 
-- 
2.17.0



[net-next 06/13] net/mlx5e: Vxlan, add direct delete function

2018-07-27 Thread Saeed Mahameed
Add direct vxlan delete function to be called from vxlan_delete_work.
Needed in downstream patch.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c| 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 4a86d8132fc1..64883ca451aa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -158,17 +158,12 @@ static void mlx5e_vxlan_add_work(struct work_struct *work)
kfree(vxlan_work);
 }
 
-static void mlx5e_vxlan_del_work(struct work_struct *work)
+static void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port)
 {
-   struct mlx5e_vxlan_work *vxlan_work =
-   container_of(work, struct mlx5e_vxlan_work, work);
-   struct mlx5e_priv *priv = vxlan_work->priv;
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   u16 port = vxlan_work->port;
struct mlx5e_vxlan *vxlan;
bool remove = false;
 
-   mutex_lock(>state_lock);
write_lock_bh(_db->lock);
vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
if (!vxlan)
@@ -187,6 +182,17 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
kfree(vxlan);
vxlan_db->num_ports--;
}
+}
+
+static void mlx5e_vxlan_del_work(struct work_struct *work)
+{
+   struct mlx5e_vxlan_work *vxlan_work =
+   container_of(work, struct mlx5e_vxlan_work, work);
+   struct mlx5e_priv *priv = vxlan_work->priv;
+   u16 port = vxlan_work->port;
+
+   mutex_lock(>state_lock);
+   mlx5e_vxlan_del_port(priv, port);
mutex_unlock(>state_lock);
kfree(vxlan_work);
 }
-- 
2.17.0



[net-next 08/13] net/mlx5e: Vxlan, rename struct mlx5e_vxlan to mlx5_vxlan_port

2018-07-27 Thread Saeed Mahameed
The name mlx5e_vxlan will be used in downstream patch to describe
mlx5 vxlan structure that will replace mlx5e_vxlan_db.

Hence we rename struct mlx5e_vxlan to mlx5_vxlan_port which describes a
mlx5 vxlan port.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 62 +--
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   |  4 +-
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 2a25c2dd6e6f..95e3140da65c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -36,7 +36,7 @@
 #include "mlx5_core.h"
 #include "vxlan.h"
 
-struct mlx5e_vxlan {
+struct mlx5_vxlan_port {
struct hlist_node hlist;
atomic_t refcount;
u16 udp_port;
@@ -83,40 +83,40 @@ static int mlx5e_vxlan_core_del_port_cmd(struct 
mlx5_core_dev *mdev, u16 port)
return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
 }
 
-static struct mlx5e_vxlan *mlx5e_vxlan_lookup_port_locked(struct mlx5e_priv 
*priv,
- u16 port)
+static struct mlx5_vxlan_port*
+mlx5e_vxlan_lookup_port_locked(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   struct mlx5e_vxlan*vxlan;
+   struct mlx5_vxlan_port *vxlanp;
 
-   hash_for_each_possible(vxlan_db->htable, vxlan, hlist, port) {
-   if (vxlan->udp_port == port)
-   return vxlan;
+   hash_for_each_possible(vxlan_db->htable, vxlanp, hlist, port) {
+   if (vxlanp->udp_port == port)
+   return vxlanp;
}
 
return NULL;
 }
 
-struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port)
+struct mlx5_vxlan_port *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 
port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   struct mlx5e_vxlan *vxlan;
+   struct mlx5_vxlan_port *vxlanp;
 
read_lock_bh(_db->lock);
-   vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
+   vxlanp = mlx5e_vxlan_lookup_port_locked(priv, port);
read_unlock_bh(_db->lock);
 
-   return vxlan;
+   return vxlanp;
 }
 
 void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   struct mlx5e_vxlan *vxlan;
+   struct mlx5_vxlan_port *vxlanp;
 
-   vxlan = mlx5e_vxlan_lookup_port(priv, port);
-   if (vxlan) {
-   atomic_inc(>refcount);
+   vxlanp = mlx5e_vxlan_lookup_port(priv, port);
+   if (vxlanp) {
+   atomic_inc(>refcount);
return;
}
 
@@ -130,15 +130,15 @@ void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 
port)
if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
return;
 
-   vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL);
-   if (!vxlan)
+   vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL);
+   if (!vxlanp)
goto err_delete_port;
 
-   vxlan->udp_port = port;
-   atomic_set(>refcount, 1);
+   vxlanp->udp_port = port;
+   atomic_set(>refcount, 1);
 
write_lock_bh(_db->lock);
-   hash_add(vxlan_db->htable, >hlist, port);
+   hash_add(vxlan_db->htable, >hlist, port);
write_unlock_bh(_db->lock);
 
vxlan_db->num_ports++;
@@ -151,16 +151,16 @@ void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 
port)
 void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   struct mlx5e_vxlan *vxlan;
+   struct mlx5_vxlan_port *vxlanp;
bool remove = false;
 
write_lock_bh(_db->lock);
-   vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
-   if (!vxlan)
+   vxlanp = mlx5e_vxlan_lookup_port_locked(priv, port);
+   if (!vxlanp)
goto out_unlock;
 
-   if (atomic_dec_and_test(>refcount)) {
-   hash_del(>hlist);
+   if (atomic_dec_and_test(>refcount)) {
+   hash_del(>hlist);
remove = true;
}
 
@@ -169,7 +169,7 @@ void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port)
 
if (remove) {
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
-   kfree(vxlan);
+   kfree(vxlanp);
vxlan_db->num_ports--;
}
 }
@@ -177,14 +177,14 @@ void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 
port)
 void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   struct mlx5e_vxlan *vxlan;
+   struct mlx5_vxlan_port *vxlanp;
struct hlist_node *tmp;
int bkt;
 
/* Lockless since we are the only hash table consumers, wq and TX are 
disabled */
-   hash_for_each_safe(vxlan_db->htable, bkt, tmp, vxlan, 

[net-next 09/13] net/mlx5e: Vxlan, rename from mlx5e to mlx5

2018-07-27 Thread Saeed Mahameed
Rename vxlan functions from mlx5e_vxlan_* to mlx5_vxlan_*.
Rename mlx5e_vxlan_db to mlx5_vxlan and move it from en.h to vxlan.c
since it is not related to mlx5e anymore.

Allocate mlx5_vxlan structure dynamically in order to make it easier to
move later to core driver and to make it private in vxlan.c.

This is in preparation to move vxlan API to mlx5 core.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  10 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  21 ++--
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   |   4 +-
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 118 ++
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   |  30 +++--
 5 files changed, 104 insertions(+), 79 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 870ac617550c..1bd4536b9061 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -52,6 +52,7 @@
 #include "wq.h"
 #include "mlx5_core.h"
 #include "en_stats.h"
+#include "vxlan.h"
 
 struct page_pool;
 
@@ -654,13 +655,6 @@ enum {
MLX5E_STATE_DESTROYING,
 };
 
-struct mlx5e_vxlan_db {
-   rwlock_tlock; /* protect vxlan table */
-   /* max_num_ports is usuallly 4, 16 buckets is more than enough */
-   DECLARE_HASHTABLE(htable, 4);
-   int num_ports;
-};
-
 struct mlx5e_l2_rule {
u8  addr[ETH_ALEN + 2];
struct mlx5_flow_handle *rule;
@@ -818,7 +812,7 @@ struct mlx5e_priv {
u32tx_rates[MLX5E_MAX_NUM_SQS];
 
struct mlx5e_flow_steering fs;
-   struct mlx5e_vxlan_db  vxlan;
+   struct mlx5_vxlan  *vxlan;
 
struct workqueue_struct*wq;
struct work_struct update_carrier_work;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7a6b78e3b5f7..ef4b2f0c427c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2974,7 +2974,7 @@ int mlx5e_open(struct net_device *netdev)
mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP);
mutex_unlock(>state_lock);
 
-   if (mlx5e_vxlan_allowed(priv->mdev))
+   if (mlx5_vxlan_allowed(priv->vxlan))
udp_tunnel_get_rx_info(netdev);
 
return err;
@@ -3983,7 +3983,7 @@ static void mlx5e_vxlan_add_work(struct work_struct *work)
u16 port = vxlan_work->port;
 
mutex_lock(>state_lock);
-   mlx5e_vxlan_add_port(priv, port);
+   mlx5_vxlan_add_port(priv->vxlan, port);
mutex_unlock(>state_lock);
 
kfree(vxlan_work);
@@ -3997,7 +3997,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
u16 port = vxlan_work->port;
 
mutex_lock(>state_lock);
-   mlx5e_vxlan_del_port(priv, port);
+   mlx5_vxlan_del_port(priv->vxlan, port);
mutex_unlock(>state_lock);
kfree(vxlan_work);
 }
@@ -4028,7 +4028,7 @@ static void mlx5e_add_vxlan_port(struct net_device 
*netdev,
if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
return;
 
-   if (!mlx5e_vxlan_allowed(priv->mdev))
+   if (!mlx5_vxlan_allowed(priv->vxlan))
return;
 
mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1);
@@ -4042,7 +4042,7 @@ static void mlx5e_del_vxlan_port(struct net_device 
*netdev,
if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
return;
 
-   if (!mlx5e_vxlan_allowed(priv->mdev))
+   if (!mlx5_vxlan_allowed(priv->vxlan))
return;
 
mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0);
@@ -4076,7 +4076,7 @@ static netdev_features_t 
mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
port = be16_to_cpu(udph->dest);
 
/* Verify if UDP port is being offloaded by HW */
-   if (mlx5e_vxlan_lookup_port(priv, port))
+   if (mlx5_vxlan_lookup_port(priv->vxlan, port))
return features;
}
 
@@ -4648,7 +4648,7 @@ static void mlx5e_build_nic_netdev(struct net_device 
*netdev)
netdev->hw_features  |= NETIF_F_HW_VLAN_CTAG_FILTER;
netdev->hw_features  |= NETIF_F_HW_VLAN_STAG_TX;
 
-   if (mlx5e_vxlan_allowed(mdev) || MLX5_CAP_ETH(mdev, 
tunnel_stateless_gre)) {
+   if (mlx5_vxlan_allowed(priv->vxlan) || MLX5_CAP_ETH(mdev, 
tunnel_stateless_gre)) {
netdev->hw_enc_features |= NETIF_F_IP_CSUM;
netdev->hw_enc_features |= NETIF_F_IPV6_CSUM;
netdev->hw_enc_features |= NETIF_F_TSO;
@@ -4656,7 +4656,7 @@ static void mlx5e_build_nic_netdev(struct net_device 
*netdev)
netdev->hw_enc_features |= NETIF_F_GSO_PARTIAL;
}
 
-   if (mlx5e_vxlan_allowed(mdev)) {
+   if 

[net-next 01/13] net/mlx5e: Vxlan, reflect 4789 UDP port default addition to software database

2018-07-27 Thread Saeed Mahameed
From: Gal Pressman 

The hardware offloads 4789 UDP port (default VXLAN port) automatically.
Add it to the software database as well in order to reflect the hardware
state appropriately.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 40 +--
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 2f74953e4561..2f68d13e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -36,12 +36,20 @@
 #include "mlx5_core.h"
 #include "vxlan.h"
 
+static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port);
+
 void mlx5e_vxlan_init(struct mlx5e_priv *priv)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
 
spin_lock_init(_db->lock);
INIT_RADIX_TREE(_db->tree, GFP_ATOMIC);
+
+   if (mlx5e_vxlan_allowed(priv->mdev))
+   /* Hardware adds 4789 by default.
+* Lockless since we are the only hash table consumers, wq and 
TX are disabled.
+*/
+   mlx5e_vxlan_add_port(priv, 4789);
 }
 
 static int mlx5e_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port)
@@ -78,25 +86,20 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct 
mlx5e_priv *priv, u16 port)
return vxlan;
 }
 
-static void mlx5e_vxlan_add_port(struct work_struct *work)
+static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port)
 {
-   struct mlx5e_vxlan_work *vxlan_work =
-   container_of(work, struct mlx5e_vxlan_work, work);
-   struct mlx5e_priv *priv = vxlan_work->priv;
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
-   u16 port = vxlan_work->port;
struct mlx5e_vxlan *vxlan;
int err;
 
-   mutex_lock(>state_lock);
vxlan = mlx5e_vxlan_lookup_port(priv, port);
if (vxlan) {
atomic_inc(>refcount);
-   goto free_work;
+   return;
}
 
if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
-   goto free_work;
+   return;
 
vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL);
if (!vxlan)
@@ -111,18 +114,29 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
if (err)
goto err_free;
 
-   goto free_work;
+   return;
 
 err_free:
kfree(vxlan);
 err_delete_port:
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
-free_work:
+}
+
+static void mlx5e_vxlan_add_work(struct work_struct *work)
+{
+   struct mlx5e_vxlan_work *vxlan_work =
+   container_of(work, struct mlx5e_vxlan_work, work);
+   struct mlx5e_priv *priv = vxlan_work->priv;
+   u16 port = vxlan_work->port;
+
+   mutex_lock(>state_lock);
+   mlx5e_vxlan_add_port(priv, port);
mutex_unlock(>state_lock);
+
kfree(vxlan_work);
 }
 
-static void mlx5e_vxlan_del_port(struct work_struct *work)
+static void mlx5e_vxlan_del_work(struct work_struct *work)
 {
struct mlx5e_vxlan_work *vxlan_work =
container_of(work, struct mlx5e_vxlan_work, work);
@@ -164,9 +178,9 @@ void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, 
sa_family_t sa_family,
return;
 
if (add)
-   INIT_WORK(_work->work, mlx5e_vxlan_add_port);
+   INIT_WORK(_work->work, mlx5e_vxlan_add_work);
else
-   INIT_WORK(_work->work, mlx5e_vxlan_del_port);
+   INIT_WORK(_work->work, mlx5e_vxlan_del_work);
 
vxlan_work->priv = priv;
vxlan_work->port = port;
-- 
2.17.0



Re: [PATCH net] net/mlx5e: Move mlx5e_priv_flags into en_ethtool.c

2018-07-27 Thread Saeed Mahameed
On Fri, Jul 27, 2018 at 8:59 AM, Kamal Heib  wrote:
> On Thu, Jul 26, 2018 at 11:51:49AM -0700, Saeed Mahameed wrote:
>> On Sun, Jul 15, 2018 at 12:06 PM, Kamal Heib  wrote:
>> > Move the definition of mlx5e_priv_flags into en_ethtool.c because it's
>> > only used there.
>> >
>> > Fixes: 4e59e2888139 ("net/mlx5e: Introduce net device priv flags 
>> > infrastructure")
>> > Signed-off-by: Kamal Heib 
>> > ---
>> >  drivers/net/ethernet/mellanox/mlx5/core/en.h | 7 ---
>> >  drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 7 +++
>> >  2 files changed, 7 insertions(+), 7 deletions(-)
>> >
>> > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
>> > b/drivers/net/ethernet/mellanox/mlx5/core/en.h
>> > index eb9eb7aa953a..84e6a5b42286 100644
>> > --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
>> > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
>> > @@ -208,13 +208,6 @@ struct mlx5e_umr_wqe {
>> >
>> >  extern const char mlx5e_self_tests[][ETH_GSTRING_LEN];
>> >
>> > -static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = {
>> > -   "rx_cqe_moder",
>> > -   "tx_cqe_moder",
>> > -   "rx_cqe_compress",
>> > -   "rx_striding_rq",
>> > -};
>> > -
>>
>> Hi Kamal, on a second thought, i would like to drop this change and
>> keep mlx5e_priv_flags close/local to the below mlx5e_priv_flag.
>>
>> Please let me know.
>>
>
> Hi,
>
> Basically this change came to avoid the following warning when compiling
> the mlx5 driver with "W=1" flag and this error will appear for each
> file that include the "en.h".
>
> So, I suggest not drop this change.
>
> In file included from drivers/net/ethernet/mellanox/mlx5/core//en_main.c:40:0:
> drivers/net/ethernet/mellanox/mlx5/core//en.h:206:19: warning: 
> ‘mlx5e_priv_flags’ defined but not used [-Wunused-const-variable=]
>  static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = {
>

Ok Kamal,
Patch applied to net-next-mlx5.

Thank you !


[net-next 13/13] net/mlx5e: Issue direct lookup on vxlan ports by vport representors

2018-07-27 Thread Saeed Mahameed
Remove uplink representor netdevice private structure lookup, and use
mlx5 core handle directly from representor private structure to lookup
vxlan ports.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 288a57f76e84..c28fe469b04a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1124,16 +1124,12 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv,
skb_flow_dissector_target(f->dissector,
  FLOW_DISSECTOR_KEY_ENC_PORTS,
  f->mask);
-   struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-   struct mlx5e_rep_priv *uplink_rpriv = 
mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
-   struct net_device *up_dev = uplink_rpriv->netdev;
-   struct mlx5e_priv *up_priv = netdev_priv(up_dev);
 
/* Full udp dst port must be given */
if (memchr_inv(>dst, 0xff, sizeof(mask->dst)))
goto vxlan_match_offload_err;
 
-   if (mlx5_vxlan_lookup_port(up_priv->mdev->vxlan, 
be16_to_cpu(key->dst)) &&
+   if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, 
be16_to_cpu(key->dst)) &&
MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap))
parse_vxlan_attr(spec, f);
else {
@@ -2533,11 +2529,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
  struct mlx5e_tc_flow *flow)
 {
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-   struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw,
-  
REP_ETH);
-   struct net_device *up_dev = uplink_rpriv->netdev;
unsigned short family = ip_tunnel_info_af(tun_info);
-   struct mlx5e_priv *up_priv = netdev_priv(up_dev);
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
struct ip_tunnel_key *key = _info->key;
struct mlx5e_encap_entry *e;
@@ -2557,7 +2549,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
return -EOPNOTSUPP;
}
 
-   if (mlx5_vxlan_lookup_port(up_priv->mdev->vxlan, 
be16_to_cpu(key->tp_dst)) &&
+   if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, be16_to_cpu(key->tp_dst)) 
&&
MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) {
tunnel_type = MLX5_HEADER_TYPE_VXLAN;
} else {
-- 
2.17.0



[net-next 04/13] net/mlx5e: Vxlan, replace spinlock with read-write lock

2018-07-27 Thread Saeed Mahameed
From: Gal Pressman 

The VXLAN database is mainly used by readers in data path, and rarely
used by control path writers.
Multiple readers (threads) should not block each other and cause an
unnecessary contention on the lock.

Replacing the spinlock with rwlock optimizes the common use case where
adding ports to the table (adding VXLAN interfaces) is quite rare, but
the table is accessed for each VXLAN TX skb.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h|  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c | 14 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 6878925c3abf..870ac617550c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -655,7 +655,7 @@ enum {
 };
 
 struct mlx5e_vxlan_db {
-   spinlock_t  lock; /* protect vxlan table */
+   rwlock_tlock; /* protect vxlan table */
/* max_num_ports is usuallly 4, 16 buckets is more than enough */
DECLARE_HASHTABLE(htable, 4);
int num_ports;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 3c0ea9bc20e3..2733ca63e46b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -42,7 +42,7 @@ void mlx5e_vxlan_init(struct mlx5e_priv *priv)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
 
-   spin_lock_init(_db->lock);
+   rwlock_init(_db->lock);
hash_init(vxlan_db->htable);
 
if (mlx5e_vxlan_allowed(priv->mdev))
@@ -98,9 +98,9 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv 
*priv, u16 port)
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
 
-   spin_lock_bh(_db->lock);
+   read_lock_bh(_db->lock);
vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
-   spin_unlock_bh(_db->lock);
+   read_unlock_bh(_db->lock);
 
return vxlan;
 }
@@ -133,9 +133,9 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
vxlan->udp_port = port;
atomic_set(>refcount, 1);
 
-   spin_lock_bh(_db->lock);
+   write_lock_bh(_db->lock);
hash_add(vxlan_db->htable, >hlist, port);
-   spin_unlock_bh(_db->lock);
+   write_unlock_bh(_db->lock);
 
vxlan_db->num_ports++;
return;
@@ -169,7 +169,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
bool remove = false;
 
mutex_lock(>state_lock);
-   spin_lock_bh(_db->lock);
+   write_lock_bh(_db->lock);
vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
if (!vxlan)
goto out_unlock;
@@ -180,7 +180,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
}
 
 out_unlock:
-   spin_unlock_bh(_db->lock);
+   write_unlock_bh(_db->lock);
 
if (remove) {
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
-- 
2.17.0



[net-next 12/13] net/mlx5e: Vxlan, move vxlan logic to core driver

2018-07-27 Thread Saeed Mahameed
Move vxlan logic and objects to the mlx5 core driver,
since it is going to be used from different mlx5 interfaces,
e.g. mlx5e PF NIC netdev and mlx5e E-Switch representors.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/Makefile  |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  2 --
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 21 ---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   |  6 +++---
 .../mellanox/mlx5/core/{ => lib}/vxlan.c  |  0
 .../mellanox/mlx5/core/{ => lib}/vxlan.h  |  0
 .../net/ethernet/mellanox/mlx5/core/main.c|  5 +
 include/linux/mlx5/driver.h   |  2 ++
 8 files changed, 21 insertions(+), 19 deletions(-)
 rename drivers/net/ethernet/mellanox/mlx5/core/{ => lib}/vxlan.c (100%)
 rename drivers/net/ethernet/mellanox/mlx5/core/{ => lib}/vxlan.h (100%)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index ae2bdcb1647c..f20fda1ced4f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -14,8 +14,8 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o 
fpga/conn.o fpga/sdk.o \
fpga/ipsec.o fpga/tls.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o 
\
-   en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o vxlan.o \
-   en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o
+   en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
+   en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o lib/vxlan.o
 
 mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 1bd4536b9061..c7ed3d20fd54 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -52,7 +52,6 @@
 #include "wq.h"
 #include "mlx5_core.h"
 #include "en_stats.h"
-#include "vxlan.h"
 
 struct page_pool;
 
@@ -812,7 +811,6 @@ struct mlx5e_priv {
u32tx_rates[MLX5E_MAX_NUM_SQS];
 
struct mlx5e_flow_steering fs;
-   struct mlx5_vxlan  *vxlan;
 
struct workqueue_struct*wq;
struct work_struct update_carrier_work;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ef4b2f0c427c..fde35021a257 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -45,7 +45,7 @@
 #include "en_accel/tls.h"
 #include "accel/ipsec.h"
 #include "accel/tls.h"
-#include "vxlan.h"
+#include "lib/vxlan.h"
 #include "en/port.h"
 #include "en/xdp.h"
 
@@ -2974,7 +2974,7 @@ int mlx5e_open(struct net_device *netdev)
mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP);
mutex_unlock(>state_lock);
 
-   if (mlx5_vxlan_allowed(priv->vxlan))
+   if (mlx5_vxlan_allowed(priv->mdev->vxlan))
udp_tunnel_get_rx_info(netdev);
 
return err;
@@ -3983,7 +3983,7 @@ static void mlx5e_vxlan_add_work(struct work_struct *work)
u16 port = vxlan_work->port;
 
mutex_lock(>state_lock);
-   mlx5_vxlan_add_port(priv->vxlan, port);
+   mlx5_vxlan_add_port(priv->mdev->vxlan, port);
mutex_unlock(>state_lock);
 
kfree(vxlan_work);
@@ -3997,7 +3997,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
u16 port = vxlan_work->port;
 
mutex_lock(>state_lock);
-   mlx5_vxlan_del_port(priv->vxlan, port);
+   mlx5_vxlan_del_port(priv->mdev->vxlan, port);
mutex_unlock(>state_lock);
kfree(vxlan_work);
 }
@@ -4028,7 +4028,7 @@ static void mlx5e_add_vxlan_port(struct net_device 
*netdev,
if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
return;
 
-   if (!mlx5_vxlan_allowed(priv->vxlan))
+   if (!mlx5_vxlan_allowed(priv->mdev->vxlan))
return;
 
mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1);
@@ -4042,7 +4042,7 @@ static void mlx5e_del_vxlan_port(struct net_device 
*netdev,
if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
return;
 
-   if (!mlx5_vxlan_allowed(priv->vxlan))
+   if (!mlx5_vxlan_allowed(priv->mdev->vxlan))
return;
 
mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0);
@@ -4076,7 +4076,7 @@ static netdev_features_t 
mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
port = be16_to_cpu(udph->dest);
 
/* Verify if UDP port is being offloaded by HW */
-   if (mlx5_vxlan_lookup_port(priv->vxlan, port))
+   if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, port))
return features;
}
 
@@ -4648,7 +4648,7 @@ static void mlx5e_build_nic_netdev(struct net_device 
*netdev)

[net-next 11/13] net/mlx5e: Vxlan, add sync lock for add/del vxlan port

2018-07-27 Thread Saeed Mahameed
Vxlan API can and will be called from different mlx5 modules, we should
not count on mlx5e private state lock only, hence we introduce a vxlan
private mutex to sync between add/del vxlan port operations.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c| 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index f5353134542d..c126a790234d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -38,8 +38,9 @@
 
 struct mlx5_vxlan {
struct mlx5_core_dev*mdev;
-   rwlock_tlock; /* protect vxlan table */
int num_ports;
+   struct mutexsync_lock; /* sync add/del port HW 
operations */
+   rwlock_tlock; /* sync vxlan table with data 
path access */
/* max_num_ports is usuallly 4, 16 buckets is more than enough */
DECLARE_HASHTABLE(htable, 4);
 };
@@ -115,17 +116,18 @@ int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 
port)
return 0;
}
 
+   mutex_lock(>sync_lock);
if (vxlan->num_ports >= mlx5_vxlan_max_udp_ports(vxlan->mdev)) {
mlx5_core_info(vxlan->mdev,
   "UDP port (%d) not offloaded, max number of UDP 
ports (%d) are already offloaded\n",
   port, mlx5_vxlan_max_udp_ports(vxlan->mdev));
ret = -ENOSPC;
-   return ret;
+   goto unlock;
}
 
ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port);
if (ret)
-   return ret;
+   goto unlock;
 
vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL);
if (!vxlanp) {
@@ -141,10 +143,14 @@ int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 
port)
write_unlock_bh(>lock);
 
vxlan->num_ports++;
+   mutex_unlock(>sync_lock);
return 0;
 
 err_delete_port:
mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port);
+
+unlock:
+   mutex_unlock(>sync_lock);
return ret;
 }
 
@@ -154,6 +160,8 @@ int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
bool remove = false;
int ret = 0;
 
+   mutex_lock(>sync_lock);
+
write_lock_bh(>lock);
vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port);
if (!vxlanp) {
@@ -174,6 +182,9 @@ int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
kfree(vxlanp);
vxlan->num_ports--;
}
+
+   mutex_unlock(>sync_lock);
+
return ret;
 }
 
@@ -189,6 +200,7 @@ struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev 
*mdev)
return ERR_PTR(-ENOMEM);
 
vxlan->mdev = mdev;
+   mutex_init(>sync_lock);
rwlock_init(>lock);
hash_init(vxlan->htable);
 
-- 
2.17.0



[net-next 07/13] net/mlx5e: Vxlan, move netdev only logic to en_main.c

2018-07-27 Thread Saeed Mahameed
Create a direct vxlan API to add and delete vxlan ports from HW.
+void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port);
+void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port);

And move vxlan_add/del_work to en_main.c since they are netdev only
logic.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 51 +
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 55 +++
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   | 16 +-
 3 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 14a201cbb0a4..7a6b78e3b5f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3969,6 +3969,57 @@ static int mlx5e_get_vf_stats(struct net_device *dev,
 }
 #endif
 
+struct mlx5e_vxlan_work {
+   struct work_struct  work;
+   struct mlx5e_priv   *priv;
+   u16 port;
+};
+
+static void mlx5e_vxlan_add_work(struct work_struct *work)
+{
+   struct mlx5e_vxlan_work *vxlan_work =
+   container_of(work, struct mlx5e_vxlan_work, work);
+   struct mlx5e_priv *priv = vxlan_work->priv;
+   u16 port = vxlan_work->port;
+
+   mutex_lock(>state_lock);
+   mlx5e_vxlan_add_port(priv, port);
+   mutex_unlock(>state_lock);
+
+   kfree(vxlan_work);
+}
+
+static void mlx5e_vxlan_del_work(struct work_struct *work)
+{
+   struct mlx5e_vxlan_work *vxlan_work =
+   container_of(work, struct mlx5e_vxlan_work, work);
+   struct mlx5e_priv *priv = vxlan_work->priv;
+   u16 port = vxlan_work->port;
+
+   mutex_lock(>state_lock);
+   mlx5e_vxlan_del_port(priv, port);
+   mutex_unlock(>state_lock);
+   kfree(vxlan_work);
+}
+
+static void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add)
+{
+   struct mlx5e_vxlan_work *vxlan_work;
+
+   vxlan_work = kmalloc(sizeof(*vxlan_work), GFP_ATOMIC);
+   if (!vxlan_work)
+   return;
+
+   if (add)
+   INIT_WORK(_work->work, mlx5e_vxlan_add_work);
+   else
+   INIT_WORK(_work->work, mlx5e_vxlan_del_work);
+
+   vxlan_work->priv = priv;
+   vxlan_work->port = port;
+   queue_work(priv->wq, _work->work);
+}
+
 static void mlx5e_add_vxlan_port(struct net_device *netdev,
 struct udp_tunnel_info *ti)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 64883ca451aa..2a25c2dd6e6f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -36,7 +36,11 @@
 #include "mlx5_core.h"
 #include "vxlan.h"
 
-static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port);
+struct mlx5e_vxlan {
+   struct hlist_node hlist;
+   atomic_t refcount;
+   u16 udp_port;
+};
 
 void mlx5e_vxlan_init(struct mlx5e_priv *priv)
 {
@@ -105,7 +109,7 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct 
mlx5e_priv *priv, u16 port)
return vxlan;
 }
 
-static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port)
+void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
@@ -144,21 +148,7 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
 }
 
-static void mlx5e_vxlan_add_work(struct work_struct *work)
-{
-   struct mlx5e_vxlan_work *vxlan_work =
-   container_of(work, struct mlx5e_vxlan_work, work);
-   struct mlx5e_priv *priv = vxlan_work->priv;
-   u16 port = vxlan_work->port;
-
-   mutex_lock(>state_lock);
-   mlx5e_vxlan_add_port(priv, port);
-   mutex_unlock(>state_lock);
-
-   kfree(vxlan_work);
-}
-
-static void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port)
+void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
@@ -184,37 +174,6 @@ static void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, 
u16 port)
}
 }
 
-static void mlx5e_vxlan_del_work(struct work_struct *work)
-{
-   struct mlx5e_vxlan_work *vxlan_work =
-   container_of(work, struct mlx5e_vxlan_work, work);
-   struct mlx5e_priv *priv = vxlan_work->priv;
-   u16 port = vxlan_work->port;
-
-   mutex_lock(>state_lock);
-   mlx5e_vxlan_del_port(priv, port);
-   mutex_unlock(>state_lock);
-   kfree(vxlan_work);
-}
-
-void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add)
-{
-   struct mlx5e_vxlan_work *vxlan_work;
-
-   vxlan_work = kmalloc(sizeof(*vxlan_work), GFP_ATOMIC);
-   if (!vxlan_work)
-   return;
-

[net-next 05/13] net/mlx5e: Vxlan, cleanup an unused member in vxlan work

2018-07-27 Thread Saeed Mahameed
From: Gal Pressman 

Clean up the sa_family member of the vxlan work; it is not used or needed
anywhere in the code.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c   | 4 +---
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.h   | 4 +---
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index fad947079a43..14a201cbb0a4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3980,7 +3980,7 @@ static void mlx5e_add_vxlan_port(struct net_device 
*netdev,
if (!mlx5e_vxlan_allowed(priv->mdev))
return;
 
-   mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 1);
+   mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1);
 }
 
 static void mlx5e_del_vxlan_port(struct net_device *netdev,
@@ -3994,7 +3994,7 @@ static void mlx5e_del_vxlan_port(struct net_device 
*netdev,
if (!mlx5e_vxlan_allowed(priv->mdev))
return;
 
-   mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 0);
+   mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0);
 }
 
 static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 2733ca63e46b..4a86d8132fc1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -191,8 +191,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
kfree(vxlan_work);
 }
 
-void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, sa_family_t sa_family,
-   u16 port, int add)
+void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add)
 {
struct mlx5e_vxlan_work *vxlan_work;
 
@@ -207,7 +206,6 @@ void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, 
sa_family_t sa_family,
 
vxlan_work->priv = priv;
vxlan_work->port = port;
-   vxlan_work->sa_family = sa_family;
queue_work(priv->wq, _work->work);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
index 52c41c22235d..51f19e3e5784 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
@@ -44,7 +44,6 @@ struct mlx5e_vxlan {
 struct mlx5e_vxlan_work {
struct work_struct  work;
struct mlx5e_priv   *priv;
-   sa_family_t sa_family;
u16 port;
 };
 
@@ -57,8 +56,7 @@ static inline bool mlx5e_vxlan_allowed(struct mlx5_core_dev 
*mdev)
 void mlx5e_vxlan_init(struct mlx5e_priv *priv);
 void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv);
 
-void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, sa_family_t sa_family,
-   u16 port, int add);
+void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add);
 struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port);
 
 #endif /* __MLX5_VXLAN_H__ */
-- 
2.17.0



[pull request][net-next 00/13] Mellanox, mlx5 updates 2018-07-27 (Vxlan updates)

2018-07-27 Thread Saeed Mahameed
Hi Dave,

This series from Gal and Saeed provides updates to mlx5 vxlan implementation.

For more information please see tag log below.

Please pull and let me know if there's any problem.

Thanks,
Saeed.

---

The following changes since commit 1f3ed383fb9a073ae2e408cd7a0717b04c7c3a21:

  net: sched: don't dump chains only held by actions (2018-07-27 09:38:46 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5e-updates-2018-07-27

for you to fetch changes up to 50920f3f268405af2b047ae9277efdbf4ed7510f:

  net/mlx5e: Issue direct lookup on vxlan ports by vport representors 
(2018-07-27 13:57:55 -0700)


mlx5e-updates-2018-07-27 (Vxlan updates)

This series from Gal and Saeed provides updates to mlx5 vxlan implementation.

Gal started with three cleanups to reflect the actual hardware vxlan state:
- reflect 4789 UDP port default addition to software database
- check maximum number of vxlan  UDP ports
- cleanup an unused member in vxlan work

Then Gal provides two performance optimizations by replacing the
vxlan radix tree with a hash table, and replacing the vxlan table
spin lock with a read-write lock.

Measuring mlx5e_vxlan_lookup_port execution time:

  Radix Tree   Hash Table
 ---  
  Single Stream   161 ns   79  ns (51% improvement)
  Multi Stream259 ns   136 ns (47% improvement)

Measuring UDP stream packet rate, single fully utilized TX core:
Radix Tree: 498,300 PPS
Hash Table: 555,468 PPS (11% improvement)

Next, from Saeed, vxlan refactoring to allow sharing the vxlan table
between different mlx5 netdevice instances like PF and VF representors,
this is done by making mlx5 vxlan interface more generic and decoupling
it from PF netdevice structures and logic, then moving it into mlx5 core
as a low level interface so it can be used by VF representors, which is
illustrated in the last patch of the series.

-Saeed.


Gal Pressman (5):
  net/mlx5e: Vxlan, reflect 4789 UDP port default addition to software 
database
  net/mlx5e: Vxlan, check maximum number of UDP ports
  net/mlx5e: Vxlan, replace ports radix-tree with hash table
  net/mlx5e: Vxlan, replace spinlock with read-write lock
  net/mlx5e: Vxlan, cleanup an unused member in vxlan work

Saeed Mahameed (8):
  net/mlx5e: Vxlan, add direct delete function
  net/mlx5e: Vxlan, move netdev only logic to en_main.c
  net/mlx5e: Vxlan, rename struct mlx5e_vxlan to mlx5_vxlan_port
  net/mlx5e: Vxlan, rename from mlx5e to mlx5
  net/mlx5e: Vxlan, return values for add/del port
  net/mlx5e: Vxlan, add sync lock for add/del vxlan port
  net/mlx5e: Vxlan, move vxlan logic to core driver
  net/mlx5e: Issue direct lookup on vxlan ports by vport representors

 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |   6 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  71 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c|  14 +-
 .../net/ethernet/mellanox/mlx5/core/lib/vxlan.c| 230 +
 .../ethernet/mellanox/mlx5/core/{ => lib}/vxlan.h  |  39 ++--
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c| 190 -
 include/linux/mlx5/driver.h|   2 +
 include/linux/mlx5/mlx5_ifc.h  |   4 +-
 10 files changed, 325 insertions(+), 240 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
 rename drivers/net/ethernet/mellanox/mlx5/core/{ => lib}/vxlan.h (66%)
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c


[net-next 03/13] net/mlx5e: Vxlan, replace ports radix-tree with hash table

2018-07-27 Thread Saeed Mahameed
From: Gal Pressman 

The VXLAN database is accessed in the data path for each VXLAN TX skb in
order to check whether the UDP port is being offloaded or not.
The number of elements in the database is relatively small, we can
simplify the radix-tree to a hash table and speedup the lookup process.

Measuring mlx5e_vxlan_lookup_port execution time:

  Radix Tree   Hash Table
 ---  
  Single Stream   161 ns   79  ns (51% improvement)
  Multi Stream259 ns   136 ns (47% improvement)

Measuring UDP stream packet rate, single fully utilized TX core:
Radix Tree: 498,300 PPS
Hash Table: 555,468 PPS (11% improvement)

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  3 +-
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 41 +++
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   |  1 +
 3 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c4d4db8722f5..6878925c3abf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -656,7 +656,8 @@ enum {
 
 struct mlx5e_vxlan_db {
spinlock_t  lock; /* protect vxlan table */
-   struct radix_tree_root  tree;
+   /* max_num_ports is usuallly 4, 16 buckets is more than enough */
+   DECLARE_HASHTABLE(htable, 4);
int num_ports;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index e3af2efe18ce..3c0ea9bc20e3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -43,7 +43,7 @@ void mlx5e_vxlan_init(struct mlx5e_priv *priv)
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
 
spin_lock_init(_db->lock);
-   INIT_RADIX_TREE(_db->tree, GFP_ATOMIC);
+   hash_init(vxlan_db->htable);
 
if (mlx5e_vxlan_allowed(priv->mdev))
/* Hardware adds 4789 by default.
@@ -79,13 +79,27 @@ static int mlx5e_vxlan_core_del_port_cmd(struct 
mlx5_core_dev *mdev, u16 port)
return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
 }
 
+static struct mlx5e_vxlan *mlx5e_vxlan_lookup_port_locked(struct mlx5e_priv 
*priv,
+ u16 port)
+{
+   struct mlx5e_vxlan_db *vxlan_db = >vxlan;
+   struct mlx5e_vxlan*vxlan;
+
+   hash_for_each_possible(vxlan_db->htable, vxlan, hlist, port) {
+   if (vxlan->udp_port == port)
+   return vxlan;
+   }
+
+   return NULL;
+}
+
 struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
 
spin_lock_bh(_db->lock);
-   vxlan = radix_tree_lookup(_db->tree, port);
+   vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
spin_unlock_bh(_db->lock);
 
return vxlan;
@@ -95,7 +109,6 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
-   int err;
 
vxlan = mlx5e_vxlan_lookup_port(priv, port);
if (vxlan) {
@@ -121,16 +134,12 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, 
u16 port)
atomic_set(>refcount, 1);
 
spin_lock_bh(_db->lock);
-   err = radix_tree_insert(_db->tree, vxlan->udp_port, vxlan);
+   hash_add(vxlan_db->htable, >hlist, port);
spin_unlock_bh(_db->lock);
-   if (err)
-   goto err_free;
 
vxlan_db->num_ports++;
return;
 
-err_free:
-   kfree(vxlan);
 err_delete_port:
mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
 }
@@ -161,12 +170,12 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
 
mutex_lock(>state_lock);
spin_lock_bh(_db->lock);
-   vxlan = radix_tree_lookup(_db->tree, port);
+   vxlan = mlx5e_vxlan_lookup_port_locked(priv, port);
if (!vxlan)
goto out_unlock;
 
if (atomic_dec_and_test(>refcount)) {
-   radix_tree_delete(_db->tree, port);
+   hash_del(>hlist);
remove = true;
}
 
@@ -206,13 +215,13 @@ void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv)
 {
struct mlx5e_vxlan_db *vxlan_db = >vxlan;
struct mlx5e_vxlan *vxlan;
-   unsigned int port = 0;
+   struct hlist_node *tmp;
+   int bkt;
 
-   /* Lockless since we are the only radix-tree consumers, wq is disabled 
*/
-   while (radix_tree_gang_lookup(_db->tree, (void **), port, 
1)) {
-   port = vxlan->udp_port;
-   radix_tree_delete(_db->tree, port);
-   mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
+   /* 

Re: [PATCH net-next] net: dcb: add DSCP to comment about priority selector types

2018-07-27 Thread Petr Machata
Jakub Kicinski  writes:

> Commit ee2059819450 ("net/dcb: Add dscp to priority selector type")
> added a define for the new DSCP selector type created by
> IEEE 802.1Qcd, but missed the comment enumerating all selector types.
> Update the comment.
>
> Signed-off-by: Jakub Kicinski 

Reviewed-by: Petr Machata 


Re: [iproute PATCH] lib/namespace: avoid double-mounting a /sys

2018-07-27 Thread Stephen Hemminger
On Tue, 24 Jul 2018 19:26:38 +0200
Lubomir Rintel  wrote:

> This partly reverts 8f0807023d067e2bb585a2ae8da93e59689d10f1, bringing
> back the umount(/sys) attempt.
> 
> In a LXC container we're unable to umount the sysfs instance, nor mount
> a read-write one. We still are able to create a new read-only instance.
> 
> Nevertheless, it still makes sense to attempt the umount() even though
> the sysfs is mounted read-only. Otherwise we may end up attempting to
> mount a sysfs with the same flags as is already mounted, resulting in
> an EBUSY error (meaning "Already mounted").
> 
> Perhaps this is not a very likely scenario in real world, but we hit
> it in NetworkManager test suite and makes netns_switch() somewhat more
> robust. It also fixes the case, when /sys wasn't mounted at all.
> 
> Signed-off-by: Lubomir Rintel 

Makes sense, applied.



[PATCH iproute2-next] ip: show min and max mtu

2018-07-27 Thread Stephen Hemminger
From: Stephen Hemminger 

Add min/max MTU to the link details

Signed-off-by: Stephen Hemminger 
---
 include/uapi/linux/if_link.h |  2 ++
 ip/ipaddress.c   | 10 ++
 2 files changed, 12 insertions(+)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 26e8cf8b45aa..8456ff254015 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -164,6 +164,8 @@ enum {
IFLA_CARRIER_UP_COUNT,
IFLA_CARRIER_DOWN_COUNT,
IFLA_NEW_IFINDEX,
+   IFLA_MIN_MTU,
+   IFLA_MAX_MTU,
__IFLA_MAX
 };
 
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index bcee9ab731ce..85958e1a9cef 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -1012,6 +1012,16 @@ int print_linkinfo(const struct sockaddr_nl *who,
   " promiscuity %u ",
   rta_getattr_u32(tb[IFLA_PROMISCUITY]));
 
+   if (tb[IFLA_MIN_MTU])
+   print_uint(PRINT_ANY,
+  "min_mtu", "minmtu %u ",
+  rta_getattr_u32(tb[IFLA_MIN_MTU]));
+
+   if (tb[IFLA_MAX_MTU])
+   print_uint(PRINT_ANY,
+  "max_mtu", "maxmtu %u ",
+  rta_getattr_u32(tb[IFLA_MAX_MTU]));
+
if (tb[IFLA_LINKINFO])
print_linktype(fp, tb[IFLA_LINKINFO]);
 
-- 
2.18.0



[PATCH net-next 0/3] mtu related changes

2018-07-27 Thread Stephen Hemminger
While looking at other MTU issues, noticed a couple of opportunities
for improving user experience.

Stephen Hemminger (3):
  failover: change mtu has RTNL
  net: report min and max mtu network device settings
  net: report invalid mtu value via netlink extack

 drivers/net/net_failover.c   |  4 ++--
 include/linux/netdevice.h|  2 ++
 include/uapi/linux/if_link.h |  2 ++
 net/core/dev.c   | 23 +--
 net/core/rtnetlink.c |  8 +++-
 5 files changed, 30 insertions(+), 9 deletions(-)

-- 
2.18.0



[PATCH net-next 1/3] failover: change mtu has RTNL

2018-07-27 Thread Stephen Hemminger
When changing MTU, RTNL is held so use rtnl_dereference
instead of rcu_dereference.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/net_failover.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
index d00d42c845b7..7ae1856d1f18 100644
--- a/drivers/net/net_failover.c
+++ b/drivers/net/net_failover.c
@@ -220,14 +220,14 @@ static int net_failover_change_mtu(struct net_device 
*dev, int new_mtu)
struct net_device *primary_dev, *standby_dev;
int ret = 0;
 
-   primary_dev = rcu_dereference(nfo_info->primary_dev);
+   primary_dev = rtnl_dereference(nfo_info->primary_dev);
if (primary_dev) {
ret = dev_set_mtu(primary_dev, new_mtu);
if (ret)
return ret;
}
 
-   standby_dev = rcu_dereference(nfo_info->standby_dev);
+   standby_dev = rtnl_dereference(nfo_info->standby_dev);
if (standby_dev) {
ret = dev_set_mtu(standby_dev, new_mtu);
if (ret) {
-- 
2.18.0



[PATCH net-next 3/3] net: report invalid mtu value via netlink extack

2018-07-27 Thread Stephen Hemminger
If an invalid MTU value is set through rtnetlink return extra error
information instead of putting message in kernel log. For other cases
where there is no visible API, keep the error report in the log.

Example:
# ip li set dev enp12s0 mtu 1
Error: mtu greater than device maximum.

# ifconfig enp12s0 mtu 1
SIOCSIFMTU: Invalid argument
# dmesg | tail -1
[ 2047.795467] enp12s0: mtu greater than device maximum

Signed-off-by: Stephen Hemminger 
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c| 23 +--
 net/core/rtnetlink.c  |  2 +-
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c1295c7a452e..9c917467a2c7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3546,6 +3546,8 @@ int dev_set_alias(struct net_device *, const char *, 
size_t);
 int dev_get_alias(const struct net_device *, char *, size_t);
 int dev_change_net_namespace(struct net_device *, struct net *, const char *);
 int __dev_set_mtu(struct net_device *, int);
+int dev_set_mtu_ext(struct net_device *dev, int mtu,
+   struct netlink_ext_ack *extack);
 int dev_set_mtu(struct net_device *, int);
 int dev_change_tx_queue_len(struct net_device *, unsigned long);
 void dev_set_group(struct net_device *, int);
diff --git a/net/core/dev.c b/net/core/dev.c
index 87c42c8249ae..89031b5fef9f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7523,13 +7523,15 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
 EXPORT_SYMBOL(__dev_set_mtu);
 
 /**
- * dev_set_mtu - Change maximum transfer unit
+ * dev_set_mtu_ext - Change maximum transfer unit
  * @dev: device
  * @new_mtu: new transfer unit
+ * @extack: netlink extended ack
  *
  * Change the maximum transfer size of the network device.
  */
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
+   struct netlink_ext_ack *extack)
 {
int err, orig_mtu;
 
@@ -7538,14 +7540,12 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 
/* MTU must be positive, and in range */
if (new_mtu < 0 || new_mtu < dev->min_mtu) {
-   net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
-   dev->name, new_mtu, dev->min_mtu);
+   NL_SET_ERR_MSG(extack, "mtu less than device minimum");
return -EINVAL;
}
 
if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
-   net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
-   dev->name, new_mtu, dev->max_mtu);
+   NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
return -EINVAL;
}
 
@@ -7573,6 +7573,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
}
return err;
 }
+
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+   struct netlink_ext_ack extack;
+   int err;
+
+   err = dev_set_mtu_ext(dev, new_mtu, );
+   if (err)
+   net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
+   return err;
+}
 EXPORT_SYMBOL(dev_set_mtu);
 
 /**
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 510d4f765a13..24431e578310 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2382,7 +2382,7 @@ static int do_setlink(const struct sk_buff *skb,
}
 
if (tb[IFLA_MTU]) {
-   err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+   err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
if (err < 0)
goto errout;
status |= DO_SETLINK_MODIFIED;
-- 
2.18.0



[PATCH net-next 2/3] net: report min and max mtu network device settings

2018-07-27 Thread Stephen Hemminger
Report the minimum and maximum MTU allowed on a device
via netlink so that it can be displayed by tools like
ip link.

Signed-off-by: Stephen Hemminger 
---
 include/uapi/linux/if_link.h | 2 ++
 net/core/rtnetlink.c | 6 ++
 2 files changed, 8 insertions(+)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 01b5069a73a5..1fea3ff73bf6 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -164,6 +164,8 @@ enum {
IFLA_CARRIER_UP_COUNT,
IFLA_CARRIER_DOWN_COUNT,
IFLA_NEW_IFINDEX,
+   IFLA_MIN_MTU,
+   IFLA_MAX_MTU,
__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 92b6fa5d5f6e..510d4f765a13 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1015,6 +1015,8 @@ static noinline size_t if_nlmsg_size(const struct 
net_device *dev,
   + nla_total_size(4)  /* IFLA_IF_NETNSID */
   + nla_total_size(4)  /* IFLA_CARRIER_UP_COUNT */
   + nla_total_size(4)  /* IFLA_CARRIER_DOWN_COUNT */
+  + nla_total_size(4)  /* IFLA_MIN_MTU */
+  + nla_total_size(4)  /* IFLA_MAX_MTU */
   + 0;
 }
 
@@ -1601,6 +1603,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
   netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+   nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) ||
+   nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
nla_put_u32(skb, IFLA_GROUP, dev->group) ||
nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
@@ -1732,6 +1736,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_IF_NETNSID]   = { .type = NLA_S32 },
[IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 },
[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
+   [IFLA_MIN_MTU]  = { .type = NLA_U32 },
+   [IFLA_MAX_MTU]  = { .type = NLA_U32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
-- 
2.18.0



Re: [PATCH net-next] cxgb4: print ULD queue information managed by LLD

2018-07-27 Thread David Miller
From: Rahul Lakkireddy 
Date: Fri, 27 Jul 2018 14:29:22 +0530

> Signed-off-by: Rahul Lakkireddy 
> Signed-off-by: Ganesh Goudar 

Applied, thanks.


Re: [PATCH net-next 0/4] l2tp: remove unused session fields

2018-07-27 Thread David Miller
From: Guillaume Nault 
Date: Fri, 27 Jul 2018 10:59:55 +0200

> Several fields of the session structures can be set, but remain unused
> otherwise.
> This series removes these fields and explicitly ignores the associated
> ioctls and netlink attributes.

Series applied, thank you.


Re: [PATCH net-next 0/8] mlxsw: Support DSCP prioritization and rewrite

2018-07-27 Thread David Miller
From: Ido Schimmel 
Date: Fri, 27 Jul 2018 15:26:54 +0300

> Petr says:
> 
> On ingress, a network device such as a switch assigns to packets
> priority based on various criteria. Common options include interpreting
> PCP and DSCP fields according to user configuration. When a packet
> egresses the switch, a reverse process may rewrite PCP and/or DSCP
> headers according to packet priority.
> 
> So far, mlxsw has supported prioritization based on PCP (802.1p priority
> tag). This patch set introduces support for prioritization based on
> DSCP, and DSCP rewrite.
> 
> To configure the DSCP-to-priority maps, the user is expected to invoke
> ieee_setapp and ieee_delapp DCBNL ops, e.g. by using lldptool:
 ...
> With this patch set, mlxsw uses these values to configure priority for
> DSCP values not explicitly specified in DSCP APP map. In the future we
> expect to also use this to configure default port priority for untagged
> packets.
> 
> Access to DSCP-to-priority map, priority-to-DSCP map, and default
> priority for a port is exposed through three new DCB helpers. Like the
> already-existing dcb_ieee_getapp_mask() helper, these helpers operate in
> terms of bitmaps, to support the arbitrary M:N mapping that the APP
> rules allow. Such interface presents all the relevant information from
> the APP database without necessitating exposition of iterators, locking
> or other complex primitives. It is up to the driver to then digest the
> mapping in a way that the device supports. In this patch set, mlxsw
> resolves conflicts by favoring higher-numbered DSCP values and
> priorities.

Series applied, thank you!


[PATCH net-next] net: dcb: add DSCP to comment about priority selector types

2018-07-27 Thread Jakub Kicinski
Commit ee2059819450 ("net/dcb: Add dscp to priority selector type")
added a define for the new DSCP selector type created by
IEEE 802.1Qcd, but missed the comment enumerating all selector types.
Update the comment.

Signed-off-by: Jakub Kicinski 
---
 include/uapi/linux/dcbnl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h
index 60aa2e446698..69df19aa8e72 100644
--- a/include/uapi/linux/dcbnl.h
+++ b/include/uapi/linux/dcbnl.h
@@ -233,7 +233,8 @@ struct cee_pfc {
  * 2   Well known port number over TCP or SCTP
  * 3   Well known port number over UDP or DCCP
  * 4   Well known port number over TCP, SCTP, UDP, or DCCP
- * 5-7 Reserved
+ * 5   Differentiated Services Code Point (DSCP) value
+ * 6-7 Reserved
  *
  *  Selector field values for CEE
  * 0   Ethertype
-- 
2.17.1



[PATCH] phy: Move "device present" masks earlier in file

2018-07-27 Thread Robert P. J. Day
Move the "device present" mask bits up immediately after the MMD
device definitions, since it makes no sense to have them further down
in the file.

This is purely a cosmetic change for readability.

Signed-off-by: Robert P. J. Day 

---

  since the *only* thing that actually uses the MMD definitions in
that file are the mask bits, it only makes sense to put them next to
each other -- there should be no functional change.

diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index d435b00d64ad..4897f17b0639 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -27,6 +27,17 @@
 #define MDIO_MMD_VEND1 30  /* Vendor specific 1 */
 #define MDIO_MMD_VEND2 31  /* Vendor specific 2 */

+/* Device present mask bits. */
+#define MDIO_DEVS_PRESENT(devad)   (1 << (devad))
+#define MDIO_DEVS_PMAPMD   MDIO_DEVS_PRESENT(MDIO_MMD_PMAPMD)
+#define MDIO_DEVS_WIS  MDIO_DEVS_PRESENT(MDIO_MMD_WIS)
+#define MDIO_DEVS_PCS  MDIO_DEVS_PRESENT(MDIO_MMD_PCS)
+#define MDIO_DEVS_PHYXS
MDIO_DEVS_PRESENT(MDIO_MMD_PHYXS)
+#define MDIO_DEVS_DTEXS
MDIO_DEVS_PRESENT(MDIO_MMD_DTEXS)
+#define MDIO_DEVS_TC   MDIO_DEVS_PRESENT(MDIO_MMD_TC)
+#define MDIO_DEVS_AN   MDIO_DEVS_PRESENT(MDIO_MMD_AN)
+#define MDIO_DEVS_C22EXT   MDIO_DEVS_PRESENT(MDIO_MMD_C22EXT)
+
 /* Generic MDIO registers. */
 #define MDIO_CTRL1 MII_BMCR
 #define MDIO_STAT1 MII_BMSR
@@ -113,17 +124,6 @@
 #define MDIO_PMA_SPEED_10  0x0040  /* 10M capable */
 #define MDIO_PCS_SPEED_10P2B   0x0002  /* 10PASS-TS/2BASE-TL capable */

-/* Device present registers. */
-#define MDIO_DEVS_PRESENT(devad)   (1 << (devad))
-#define MDIO_DEVS_PMAPMD   MDIO_DEVS_PRESENT(MDIO_MMD_PMAPMD)
-#define MDIO_DEVS_WIS  MDIO_DEVS_PRESENT(MDIO_MMD_WIS)
-#define MDIO_DEVS_PCS  MDIO_DEVS_PRESENT(MDIO_MMD_PCS)
-#define MDIO_DEVS_PHYXS
MDIO_DEVS_PRESENT(MDIO_MMD_PHYXS)
-#define MDIO_DEVS_DTEXS
MDIO_DEVS_PRESENT(MDIO_MMD_DTEXS)
-#define MDIO_DEVS_TC   MDIO_DEVS_PRESENT(MDIO_MMD_TC)
-#define MDIO_DEVS_AN   MDIO_DEVS_PRESENT(MDIO_MMD_AN)
-#define MDIO_DEVS_C22EXT   MDIO_DEVS_PRESENT(MDIO_MMD_C22EXT)
-
 /* Control register 2. */
 #define MDIO_PMA_CTRL2_TYPE0x000f  /* PMA/PMD type selection */
 #define MDIO_PMA_CTRL2_10GBCX4 0x  /* 10GBASE-CX4 type */

-- 


Robert P. J. Day Ottawa, Ontario, CANADA
  http://crashcourse.ca/dokuwiki

Twitter:   http://twitter.com/rpjday
LinkedIn:   http://ca.linkedin.com/in/rpjday



[PATCH net-next] Implement a rtnetlink device which simulates wifi.

2018-07-27 Thread Cody Schuffelen
The device added here is used through "ip link add ... type virt_wifi"
The intention is to take over an existing network device and produce a
new one that appears like a wireless connection, returning enough canned
responses to nl80211 to satisfy a standard network manager. If
necessary, it can also be set up one step removed from an existing
network device, such as through a vlan/80211Q or macvlan connection to
not disrupt the existing network interface.

This is being used for Google's Remote Android Virtual Device project,
which runs Android devices in virtual machines. The standard network
interfaces provided inside the virtual machines are all ethernet.
However, Android is not interested in ethernet devices and would rather
connect to a wireless interface. This patch allows the virtual machine
guest to treat one of its network connections as wireless rather than
ethernet, satisfying Android's network connection requirements.

We believe this is a generally useful driver for simulating wireless
network connections in other environments where a wireless connection is
desired by some userspace process but is not available. Future work can
also include exporting the wireless control plane to userspace, so the
device can configure the behavior of the simulated wireless network
itself.

This is distinct from other testing efforts such as mac80211_hwsim by
being a cfg80211 device instead of mac80211 device, allowing straight
pass-through on the data plane instead of forcing packaging of ethernet
data into mac80211 frames.

Signed-off-by: A. Cody Schuffelen 
Acked-by: Alistair Strachan 
Acked-by: Greg Hartman 
---
 drivers/net/wireless/Kconfig |   7 +
 drivers/net/wireless/Makefile|   2 +
 drivers/net/wireless/virt_wifi.c | 544 +++
 3 files changed, 553 insertions(+)
 create mode 100644 drivers/net/wireless/virt_wifi.c

diff --git a/drivers/net/wireless/Kconfig b/drivers/net/wireless/Kconfig
index 166920ae23f8..1781d8a7f05a 100644
--- a/drivers/net/wireless/Kconfig
+++ b/drivers/net/wireless/Kconfig
@@ -114,4 +114,11 @@ config USB_NET_RNDIS_WLAN
 
  If you choose to build a module, it'll be called rndis_wlan.
 
+config VIRT_WIFI
+   tristate "Wifi wrapper for ethernet drivers"
+   default n
+   ---help---
+ This option adds support for ethernet connections to appear as if they
+ are wifi connections through a special rtnetlink device.
+
 endif # WLAN
diff --git a/drivers/net/wireless/Makefile b/drivers/net/wireless/Makefile
index 7fc96306712a..6cfe74515c95 100644
--- a/drivers/net/wireless/Makefile
+++ b/drivers/net/wireless/Makefile
@@ -27,3 +27,5 @@ obj-$(CONFIG_PCMCIA_WL3501)   += wl3501_cs.o
 obj-$(CONFIG_USB_NET_RNDIS_WLAN)   += rndis_wlan.o
 
 obj-$(CONFIG_MAC80211_HWSIM)   += mac80211_hwsim.o
+
+obj-$(CONFIG_VIRT_WIFI)+= virt_wifi.o
diff --git a/drivers/net/wireless/virt_wifi.c b/drivers/net/wireless/virt_wifi.c
new file mode 100644
index ..602bf462b444
--- /dev/null
+++ b/drivers/net/wireless/virt_wifi.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0
+/* drivers/net/wireless/virt_wifi.c
+ *
+ * A fake implementation of cfg80211_ops that can be tacked on to an ethernet
+ * net_device to make it appear as a wireless connection.
+ *
+ * Copyright (C) 2018 Google, Inc.
+ *
+ * Author: schuffe...@google.com
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct virt_wifi_priv {
+   bool being_deleted;
+   struct cfg80211_scan_request *scan_request;
+   struct delayed_work scan_result;
+   struct delayed_work scan_complete;
+};
+
+static struct ieee80211_channel channel = {
+   .band = NL80211_BAND_5GHZ,
+   .center_freq = 5500,
+   .hw_value = 5500,
+
+   .flags = 0, /* ieee80211_channel_flags */
+   .max_antenna_gain = 20,
+   .max_power = 5500,
+   .max_reg_power = ,
+};
+
+static struct ieee80211_rate bitrate = {
+   .flags = IEEE80211_RATE_SHORT_PREAMBLE, /* ieee80211_rate_flags */
+   .bitrate = 1000,
+};
+
+static struct ieee80211_supported_band band_5ghz = {
+   .channels = ,
+   .bitrates = ,
+   .band = NL80211_BAND_5GHZ,
+   .n_channels = 1,
+   .n_bitrates = 1,
+};
+
+static struct cfg80211_inform_bss mock_inform_bss = {
+   /* ieee80211_channel* */ .chan = ,
+   /* nl80211_bss_scan_width */ .scan_width = NL80211_BSS_CHAN_WIDTH_20,
+   /* s32 */ .signal = 99,
+};
+
+static u8 fake_router_bssid[] = {4, 4, 4, 4, 4, 4};
+
+static int virt_wifi_scan(struct wiphy *wiphy,
+ struct cfg80211_scan_request *request)
+{
+   struct virt_wifi_priv *priv = wiphy_priv(wiphy);
+
+   wiphy_debug(wiphy, "scan\n");
+
+   if (priv->scan_request || priv->being_deleted)
+   return -EBUSY;
+
+   if (request->ie_len > 0)
+   wiphy_debug(wiphy, "scan: first ie: %d\n", (int)request->ie[0]);
+
+   if (request->n_ssids > 0) {
+   int i;
+

Hello Dear

2018-07-27 Thread Tracy William



Hello Dear, 

how are you today,I hope you are doing great. 

It is my great pleasure to contact you,I want to make a new and special 
friend,I hope you don't mind. My name is Tracy William from the United States, 
Am a french and English nationality. I will give you pictures and more details 
about my self as soon as i hear from you in my email account bellow, 

Thanks 
Tracy


Re: pull request (net-next): ipsec-next 2018-07-27

2018-07-27 Thread David Miller
From: Steffen Klassert 
Date: Fri, 27 Jul 2018 09:24:35 +0200

> 4) Add virtual xfrm interfaces. The purpose of these interfaces
>is to overcome the design limitations that the existing
>VTI devices have.

Thanks for taking the time to explain this in detail.  Looks
great.

Pulled, thanks Steffen.


Re: pull request (net): ipsec 2018-07-27

2018-07-27 Thread David Miller
From: Steffen Klassert 
Date: Fri, 27 Jul 2018 08:51:49 +0200

> 1) Fix PMTU handling of vti6. We update the PMTU on
>the xfrm dst_entry which is not cached anymore
>after the flowcache removal. So update the
>PMTU of the original dst_entry instead.
>From Eyal Birger.
> 
> 2) Fix a leak of kernel memory to userspace.
>From Eric Dumazet.
> 
> 3) Fix a possible dst_entry memleak in xfrm_lookup_route.
>From Tommi Rantala.
> 
> 4) Fix a skb leak in case we can't call nlmsg_multicast
>from xfrm_nlmsg_multicast. From Florian Westphal.
> 
> 5) Fix a leak of a temporary buffer in the error path of
>esp6_input. From Zhen Lei.
> 
> Please pull or let me know if there are problems.

Pulled, thanks Steffen!


[PATCH net] ipv4: remove BUG_ON() from fib_compute_spec_dst

2018-07-27 Thread Lorenzo Bianconi
Remove BUG_ON() from fib_compute_spec_dst routine and check
in_dev pointer during flowi4 data structure initialization.
fib_compute_spec_dst routine can be run concurrently with device removal
where ip_ptr net_device pointer is set to NULL. This can happen
if userspace enables pkt info on UDP rx socket and the device
is removed while traffic is flowing.

Fixes: 35ebf65e851c ("ipv4: Create and use fib_compute_spec_dst() helper")
Signed-off-by: Lorenzo Bianconi 
---
 net/ipv4/fib_frontend.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e46cdd310e5f..2998b0e47d4b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -292,19 +292,19 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
return ip_hdr(skb)->daddr;
 
in_dev = __in_dev_get_rcu(dev);
-   BUG_ON(!in_dev);
 
net = dev_net(dev);
 
scope = RT_SCOPE_UNIVERSE;
if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+   bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
struct flowi4 fl4 = {
.flowi4_iif = LOOPBACK_IFINDEX,
.flowi4_oif = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
.flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
.flowi4_scope = scope,
-   .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0,
+   .flowi4_mark = vmark ? skb->mark : 0,
};
if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
-- 
2.17.1



Re: [PATCH net] net/mlx5e: Move mlx5e_priv_flags into en_ethtool.c

2018-07-27 Thread Kamal Heib
On Thu, Jul 26, 2018 at 11:51:49AM -0700, Saeed Mahameed wrote:
> On Sun, Jul 15, 2018 at 12:06 PM, Kamal Heib  wrote:
> > Move the definition of mlx5e_priv_flags into en_ethtool.c because it's
> > only used there.
> >
> > Fixes: 4e59e2888139 ("net/mlx5e: Introduce net device priv flags 
> > infrastructure")
> > Signed-off-by: Kamal Heib 
> > ---
> >  drivers/net/ethernet/mellanox/mlx5/core/en.h | 7 ---
> >  drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 7 +++
> >  2 files changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
> > b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> > index eb9eb7aa953a..84e6a5b42286 100644
> > --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
> > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> > @@ -208,13 +208,6 @@ struct mlx5e_umr_wqe {
> >
> >  extern const char mlx5e_self_tests[][ETH_GSTRING_LEN];
> >
> > -static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = {
> > -   "rx_cqe_moder",
> > -   "tx_cqe_moder",
> > -   "rx_cqe_compress",
> > -   "rx_striding_rq",
> > -};
> > -
> 
> Hi Kamal, on second thought, I would like to drop this change and
> keep mlx5e_priv_flags close/local to the below mlx5e_priv_flag.
> 
> Please let me know.
>

Hi,

Basically, this change is meant to avoid the following warning when compiling
the mlx5 driver with the "W=1" flag; the warning appears for each
file that includes "en.h".

So, I suggest not dropping this change.

In file included from drivers/net/ethernet/mellanox/mlx5/core//en_main.c:40:0:
drivers/net/ethernet/mellanox/mlx5/core//en.h:206:19: warning: 
‘mlx5e_priv_flags’ defined but not used [-Wunused-const-variable=]
 static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = {

Thanks,
Kamal

> >  enum mlx5e_priv_flag {
> > MLX5E_PFLAG_RX_CQE_BASED_MODER = (1 << 0),
> > MLX5E_PFLAG_TX_CQE_BASED_MODER = (1 << 1),
> > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
> > b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
> > index fffe514ba855..2a1c35d82c2e 100644
> > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
> > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
> > @@ -33,6 +33,13 @@
> >  #include "en.h"
> >  #include "en/port.h"
> >
> > +static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = {
> > +   "rx_cqe_moder",
> > +   "tx_cqe_moder",
> > +   "rx_cqe_compress",
> > +   "rx_striding_rq",
> > +};
> > +
> >  void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv,
> >struct ethtool_drvinfo *drvinfo)
> >  {
> > --
> > 2.14.4
> >


Re: Deadlock with restart_syscall()

2018-07-27 Thread Stephen Hemminger
On Mon, 16 Jul 2018 09:31:06 +0200
André Pribil  wrote:

> Hello,
> 
> I'm using kernel 4.14.52-rt34 on a single core ARM system and I'm seeing a 
> deadlock inside the kernel when two RT processes make calls in the right 
> temporal distance. The first process is trying to bring the Ethernet 
> interface 
> up, with the SIOCGIFFLAGS ioctl(). The second process is checking the 
> Ethernet 
> carrier, speed and duplex status, by reading e.g. "/sys/class/net/eth1/speed".
> 
> The first process finally gets to phy_poll_reset() in 
> drivers/net/phy/phy_device.c, where it calls msleep(50). 
> It never returns from the sleep.
> 
> The second process gets to speed_show() in net/core/net-sysfs.c. It tries to 
> get
> the RTNL lock with rtnl_trylock(), but fails and calls restart_syscall(). 
> This happens over and over again.
> 
> It seems like the first process is no longer scheduled and cannot release the
> RTNL lock, while the second process is busy restarting the syscall. The first 
> process has a higher RT priority than the second process.
>  
> Just for testing I've added the TIF_NEED_RESCHED flag to the 
> restart_syscall() 
> function and I did not see the deadlock again with this change.
> 
> static inline int restart_syscall(void)
> {
>   set_tsk_thread_flag(current, TIF_SIGPENDING | TIF_NEED_RESCHED);
>   return -ERESTARTNOINTR;
> }
> 
> As a second test I released the RTNL lock while calling msleep() in 
> phy_poll_reset(). This also made the problem disappear.
> 
> I've found this thread, where a similar issue with restart_syscall() has been 
> reported:
> https://www.spinics.net/lists/netdev/msg415144.html
> 
> Any ideas how to fix this issue?
> 
> Andre   

Don't do control operations from RT processes!
There can be cases of priority inversion where RT process is waiting for
something that requires a kthread to complete the operation.


Re: [net-next v5 3/3] net/tls: Remove redundant array allocation.

2018-07-27 Thread Dave Watson
On 07/27/18 09:34 AM, Vakul Garg wrote:
> 
> 
> > -Original Message-
> > From: Dave Watson [mailto:davejwat...@fb.com]
> > Sent: Thursday, July 26, 2018 2:31 AM
> > To: Vakul Garg 
> > Cc: David Miller ; netdev@vger.kernel.org;
> > bor...@mellanox.com; avia...@mellanox.com; Doron Roberts-Kedes
> > 
> > Subject: Re: [net-next v5 3/3] net/tls: Remove redundant array allocation.
> > 
> > On 07/24/18 08:22 AM, Vakul Garg wrote:
> > > Will it be a bad idea to get rid of array 'sgin' on stack and simply
> > > kmalloc 'sgin' for whatever number of pages is returned by
> > iov_iter_npages()?
> > > We can allocate for sgout too with the same kmalloc().
> > >
> > > (Using a local array based 'sgin' is coming in the way to achieve
> > > sending multiple async record decryption requests to the accelerator
> > > without waiting for previous one to complete.)
> > 
> > Yes we could do this, and yes we would need to heap allocate if you want to
> > support multiple outstanding decryption requests.  I think async crypto
> > prevents any sort of zerocopy-fastpath, however.
> 
> We already do an aead_request_alloc (which internally does kmalloc).
> To mitigate the cost of kmalloc/kfree for sg lists and aad, I am allocating a 
> combined memory chunk for all of these and then segmenting it into
> aead_req, aad, sgin, sgout. This way there should be no extra cost for
> memory allocations in non-async.

Makes sense, sounds good to me. 


Re: [PATCH linux-firmware] Mellanox: Add new mlxsw_spectrum firmware 13.1702.6

2018-07-27 Thread Josh Boyer
Applied and pushed out.

josh
On Sun, Jul 22, 2018 at 7:43 AM Nir Dotan  wrote:
>
> This new firmware contains: - Support for new types of cables - Support for 
> flashing future firmware without reboot - Support for Router ARP BC and UC 
> traps Signed-off-by: Nir Dotan --- WHENCE | 3 ++- 
> mellanox/mlxsw_spectrum-13.1702.6.mfa2 | Bin 0 -> 863220 bytes 2 files 
> changed, 2 insertions(+), 1 deletion(-) create mode 100644 
> mellanox/mlxsw_spectrum-13.1702.6.mfa2 diff --git a/WHENCE b/WHENCE index 
> 6771831..cdc78ba 100644 --- a/WHENCE +++ b/WHENCE @@ -4050,9 +4050,10 @@ 
> Driver: mlxsw_spectrum - Mellanox Spectrum switch File: 
> mellanox/mlxsw_spectrum-13.1420.122.mfa2 File: 
> mellanox/mlxsw_spectrum-13.1530.152.mfa2 File: 
> mellanox/mlxsw_spectrum-13.1620.192.mfa2 +File: 
> mellanox/mlxsw_spectrum-13.1702.6.mfa2 Licence: - Copyright (c) 2017 Mellanox 
> Technologies, Ltd. All rights reserved. + Copyright (c) 2017-2018 Mellanox 
> Technologies, Ltd. All rights reserved. Redistribution and use in source and 
> binary forms, with or without modification, are permitted provided that the 
> following conditions are met: diff --git 
> a/mellanox/mlxsw_spectrum-13.1702.6.mfa2 
> b/mellanox/mlxsw_spectrum-13.1702.6.mfa2 new file mode 100644 index 
> ..0186d19c8285f161d2ffc6644e8451ff448e1684
>  GIT binary patch literal 863220 
> zcma}I-ySoJq?(PJ4cXuafa19>Z-QC@TJHg%E-Q9j>=Dm9N{xkPZHdRzo 
> z^;Oenb@yIt?>f>_GHSHaBK(ZBY8tfk^n?H)04M+i-~s>!-~yOFegMdr0644_z{dqX 
> zj?kAM8*A_Tw!NdthfVxS2A`F}D1L54q+7Jk4ZL)i4(M*O6%z1p)MWIYu?0og5# 
> zs;d2Z3-Do_hBdWTUTmO^FE-bQj)2dT-{-#LLO@HU?}NuM;t0F{^X~(HLd0NCp5P0ziJ4e_j8`_5Zhw|7QQC|J~C6&{q5T8ThX)3+w;c
>  ze`E;w5BdI2SO#Y1|FfX~|A&0R|H3kTB*DPI@~1U}_*={RCky$vmibQ> 
> z>TfO6pDgs>TE;(Fn7_3Qf3mRu%jy1W@c+sFqbL8lzJIm|e_{Wp 
> zALyAm{sI2gzleWf{}q?xPoE9xFDxTF>%Y|)`ETu?YmD-@_Rlp&{agFz8l(NK{d0}c 
> z|JMGw#u)$2{-MU0e_{W(7VLkh1=e5Kf35GIYmEID_J89tGyVhotHwD0#j^b);a2 
> zjBG3)HU4ug@c!2RdFDU;H~WVi@c-8SxyA&4YyVth!oRYt|9-C~`b*37Z})ZLzq0>a 
> zvp@F^$$zpRGykvk{d0{!|0nw~^Z&_`{)PSDe)!{_MfMk#k)8eDJ{$R8*#C{o_J{tZ 
> z_Xqt=Ni-hm1Sc4w^}g#h5g@Ju>N5`{^$7r^GpB)Ao)0*{Odmd_J-| 
> zac+VG2NV!sLj2#o`v12?SZV+w!lvJ{LRu>~^=eOb} 
> zkG1}GzCVx3K5X4@#a#op{jlGD$9^20Z$9jAtphL-kJ*RS`>nV_Y6}1(#e_CXQbK 
> zL@H9hWrK&)KCIAh*;2xb5BtyOl>atwz@si7v-z7joR592Hq=xdJ3fSMqWiJ`&-?3mQiXZn90=?g|EZ=Z{H(xN-xL^Pg(Zp}Xb#45p1tIxw+0=!s
>  zk2A)?Z`roRj~oDlzh&2#^Z-P;O21>lfjvL=0Kjj}8#syAN3R2ZYc9Y=yyZS@=Wpd; 
> z=0ot2FXnIAP~wks#>;Qb5O_HOH-PXt?f;Ma%I2n~@$q)UoM?^pW1j%}#{ 
> zlKuI+HUKFqfaDDt>~J%Zvf+pIiBH6}n7JaSv6XkZ@-8-X3#T1yo=%rszZS7~rV|9Y 
> zqTziaO|W*}(ME-pJ_gbRH%QKlj_0~rr=tejK}M-3{=Pd(v=x!6}^N 
> z2V}^`u|UUEuvwh=MqHTs)6ZX9>PQHjtJ+D~_z#>no%IfEG6tos=s|n7QP7c$X(F 
> z)}pf`%+Uk+BrQJ9n`$vgv@x9<_0*$_jCBJ3hAnN)x{wqSFi;R_wAbegih)~v|E#7p 
> z(YjVBJ`ly|_hQ-N+Pk>VNf?r`lNu#8o8SWkd-Bv4Tb!C2I>^I63OaMEJf~Rc{ZG9a 
> zzD{?r5+8BG9U5$BUkq4ZqPTYG+-#|zr_=djA46<=ba`IPVd_@i8oiz5K1!;dx7 
> z4U}^9h(C{m`5(PUpokm!Y5f|^J$XzyE-JD4wsCL5H1sX1Km0}6SNYDXq^8N2MslTb 
> 

Re: [PATCH iproute2 0/3] l2tp: remove unused fields in struct l2tp_parm

2018-07-27 Thread Guillaume Nault
On Fri, Jul 27, 2018 at 07:57:12AM -0700, Stephen Hemminger wrote:
> On Fri, 27 Jul 2018 12:26:28 +0200
> Guillaume Nault  wrote:
> 
> > Several fields of struct l2tp_parm are handled by create_session() but
> > can't actually be set by user.
> > Most of these fields can also be set by get_response(), but are ignored
> > afterwards.
> > 
> > Since these fields can't have any visible effect, let's just remove
> > them.
> > 
> > Guillaume Nault (3):
> >   l2tp: drop data_seq
> >   l2tp: drop mtu
> >   l2tp: drop lns_mode
> > 
> >  ip/ipl2tp.c | 13 -
> >  1 file changed, 13 deletions(-)
> > 
> 
> These make sense for iproute2 next
These patches haven't been rejected in patchwork. Does that mean that
David A. will pick them up? Or should I repost to iproute2-next anyway?



Re:

2018-07-27 Thread Laura Marcela Ramirez Romero


Has ganado 1,500,000.00 usd Correo de contacto: 
juanabellofundat...@yandex.com




Estimados todos,

Espero que este correo os encuentre bien. Os envío los links de los dos 
primeros volúmenes de la Colección Perspectivas Iberoamericanas sobre la 
Justicia, en la que



Re: [PATCH iproute2 0/3] l2tp: remove unused fields in struct l2tp_parm

2018-07-27 Thread Stephen Hemminger
On Fri, 27 Jul 2018 12:26:28 +0200
Guillaume Nault  wrote:

> Several fields of struct l2tp_parm are handled by create_session() but
> can't actually be set by user.
> Most of these fields can also be set by get_response(), but are ignored
> afterwards.
> 
> Since these fields can't have any visible effect, let's just remove
> them.
> 
> Guillaume Nault (3):
>   l2tp: drop data_seq
>   l2tp: drop mtu
>   l2tp: drop lns_mode
> 
>  ip/ipl2tp.c | 13 -
>  1 file changed, 13 deletions(-)
> 

These make sense for iproute2 next


[PATCH 5/5] net: remove sock_poll_busy_flag

2018-07-27 Thread Christoph Hellwig
Fold it into the only caller to make the code simpler and easier to read.

Signed-off-by: Christoph Hellwig 
---
 include/net/busy_poll.h |  6 --
 net/socket.c| 16 +++-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 25d762cf47f2..71c72a939bf8 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -121,12 +121,6 @@ static inline void sk_busy_loop(struct sock *sk, int 
nonblock)
 #endif
 }
 
-/* if this socket can poll_ll, tell the system call */
-static inline __poll_t sock_poll_busy_flag(struct socket *sock)
-{
-   return sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0;
-}
-
 /* used in the NIC receive handler to mark the skb */
 static inline void skb_mark_napi_id(struct sk_buff *skb,
struct napi_struct *napi)
diff --git a/net/socket.c b/net/socket.c
index 399d2ccec89d..475247e347ae 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1128,15 +1128,21 @@ EXPORT_SYMBOL(sock_create_lite);
 static __poll_t sock_poll(struct file *file, poll_table *wait)
 {
struct socket *sock = file->private_data;
-   __poll_t events = poll_requested_events(wait);
+   __poll_t events = poll_requested_events(wait), flag = 0;
 
if (!sock->ops->poll)
return 0;
 
-   /* poll once if requested by the syscall */
-   if (sk_can_busy_loop(sock->sk) && (events & POLL_BUSY_LOOP))
-   sk_busy_loop(sock->sk, 1);
-   return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock);
+   if (sk_can_busy_loop(sock->sk)) {
+   /* poll once if requested by the syscall */
+   if (events & POLL_BUSY_LOOP)
+   sk_busy_loop(sock->sk, 1);
+
+   /* if this socket can poll_ll, tell the system call */
+   flag = POLL_BUSY_LOOP;
+   }
+
+   return sock->ops->poll(file, sock, wait) | flag;
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)
-- 
2.18.0



[PATCH 3/5] net: do not detour through struct sock to find the poll waitqueue

2018-07-27 Thread Christoph Hellwig
For any open socket file descriptor sock->sk->sk_wq->wait will always
point to sock->wq->wait.  That means we can do the shorter dereference,
remove a NULL check, and don't have to worry about any RCU
protection.

Signed-off-by: Christoph Hellwig 
---
 include/net/sock.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 946ee8651714..9b6011912691 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2001,10 +2001,9 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq)
 static inline void sock_poll_wait(struct file *filp, poll_table *p)
 {
struct socket *sock = filp->private_data;
-   wait_queue_head_t *wq = sk_sleep(sock->sk);
 
-   if (!poll_does_not_wait(p) && wq) {
-   poll_wait(filp, wq, p);
+   if (!poll_does_not_wait(p)) {
+   poll_wait(filp, &sock->wq->wait, p);
/* We need to be sure we are in sync with the
 * socket flags modification.
 *
-- 
2.18.0



[PATCH 2/5] net: simplify sock_poll_wait

2018-07-27 Thread Christoph Hellwig
The wait_address argument is always directly derived from the filp
argument, so remove it.

Signed-off-by: Christoph Hellwig 
---
 crypto/af_alg.c|  2 +-
 include/net/sock.h | 11 ++-
 net/atm/common.c   |  2 +-
 net/caif/caif_socket.c |  2 +-
 net/core/datagram.c|  2 +-
 net/dccp/proto.c   |  2 +-
 net/ipv4/tcp.c |  2 +-
 net/iucv/af_iucv.c |  2 +-
 net/nfc/llcp_sock.c|  2 +-
 net/rxrpc/af_rxrpc.c   |  2 +-
 net/smc/af_smc.c   |  2 +-
 net/tipc/socket.c  |  2 +-
 net/unix/af_unix.c |  4 ++--
 13 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index c166f424871c..b053179e0bc5 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -1071,7 +1071,7 @@ __poll_t af_alg_poll(struct file *file, struct socket 
*sock,
struct af_alg_ctx *ctx = ask->private;
__poll_t mask;
 
-   sock_poll_wait(file, sk_sleep(sk), wait);
+   sock_poll_wait(file, wait);
mask = 0;
 
if (!ctx->more || ctx->used)
diff --git a/include/net/sock.h b/include/net/sock.h
index ad85d37c83c8..946ee8651714 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1994,16 +1994,17 @@ static inline bool skwq_has_sleeper(struct socket_wq 
*wq)
 /**
  * sock_poll_wait - place memory barrier behind the poll_wait call.
  * @filp:   file
- * @wait_address:   socket wait queue
  * @p:  poll_table
  *
  * See the comments in the wq_has_sleeper function.
  */
-static inline void sock_poll_wait(struct file *filp,
-   wait_queue_head_t *wait_address, poll_table *p)
+static inline void sock_poll_wait(struct file *filp, poll_table *p)
 {
-   if (!poll_does_not_wait(p) && wait_address) {
-   poll_wait(filp, wait_address, p);
+   struct socket *sock = filp->private_data;
+   wait_queue_head_t *wq = sk_sleep(sock->sk);
+
+   if (!poll_does_not_wait(p) && wq) {
+   poll_wait(filp, wq, p);
/* We need to be sure we are in sync with the
 * socket flags modification.
 *
diff --git a/net/atm/common.c b/net/atm/common.c
index a7a68e509628..9f8cb0d2e71e 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -653,7 +653,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, 
poll_table *wait)
struct atm_vcc *vcc;
__poll_t mask;
 
-   sock_poll_wait(file, sk_sleep(sk), wait);
+   sock_poll_wait(file, wait);
mask = 0;
 
vcc = ATM_SD(sock);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index a6fb1b3bcad9..d18965f3291f 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -941,7 +941,7 @@ static __poll_t caif_poll(struct file *file,
__poll_t mask;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
 
-   sock_poll_wait(file, sk_sleep(sk), wait);
+   sock_poll_wait(file, wait);
mask = 0;
 
/* exceptional events? */
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 9938952c5c78..9aac0d63d53e 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -837,7 +837,7 @@ __poll_t datagram_poll(struct file *file, struct socket 
*sock,
struct sock *sk = sock->sk;
__poll_t mask;
 
-   sock_poll_wait(file, sk_sleep(sk), wait);
+   sock_poll_wait(file, wait);
mask = 0;
 
/* exceptional events? */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 0d56e36a6db7..875858c8b059 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -325,7 +325,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
__poll_t mask;
struct sock *sk = sock->sk;
 
-   sock_poll_wait(file, sk_sleep(sk), wait);
+   sock_poll_wait(file, wait);
if (sk->sk_state == DCCP_LISTEN)
return inet_csk_listen_poll(sk);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4491faf83f4f..0b93290c9255 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, 
poll_table *wait)
const struct tcp_sock *tp = tcp_sk(sk);
int state;
 
-   sock_poll_wait(file, sk_sleep(sk), wait);
+   sock_poll_wait(file, wait);
 
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 893a022f9620..e7b93cd14b52 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1494,7 +1494,7 @@ __poll_t iucv_sock_poll(struct file *file, struct socket 
*sock,
struct sock *sk = sock->sk;
__poll_t mask = 0;
 
-   sock_poll_wait(file, sk_sleep(sk), wait);
+   sock_poll_wait(file, wait);
 
if (sk->sk_state == IUCV_LISTEN)
return iucv_accept_poll(sk);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index ea0c0c6f1874..dd4adf8b1167 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -556,7 +556,7 @@ static __poll_t 

[PATCH 4/5] net: remove sock_poll_busy_loop

2018-07-27 Thread Christoph Hellwig
There is no point in hiding this logic in a helper.  Also remove the
useless events != 0 check and only busy loop once we know we actually
have a poll method.

Signed-off-by: Christoph Hellwig 
---
 include/net/busy_poll.h | 9 -
 net/socket.c| 5 -
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index c5187438af38..25d762cf47f2 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -121,15 +121,6 @@ static inline void sk_busy_loop(struct sock *sk, int 
nonblock)
 #endif
 }
 
-static inline void sock_poll_busy_loop(struct socket *sock, __poll_t events)
-{
-   if (sk_can_busy_loop(sock->sk) &&
-   events && (events & POLL_BUSY_LOOP)) {
-   /* once, only if requested by syscall */
-   sk_busy_loop(sock->sk, 1);
-   }
-}
-
 /* if this socket can poll_ll, tell the system call */
 static inline __poll_t sock_poll_busy_flag(struct socket *sock)
 {
diff --git a/net/socket.c b/net/socket.c
index 39e0afbdd797..399d2ccec89d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1130,9 +1130,12 @@ static __poll_t sock_poll(struct file *file, poll_table 
*wait)
struct socket *sock = file->private_data;
__poll_t events = poll_requested_events(wait);
 
-   sock_poll_busy_loop(sock, events);
if (!sock->ops->poll)
return 0;
+
+   /* poll once if requested by the syscall */
+   if (sk_can_busy_loop(sock->sk) && (events & POLL_BUSY_LOOP))
+   sk_busy_loop(sock->sk, 1);
return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock);
 }
 
-- 
2.18.0



[PATCH 1/5] net: remove bogus RCU annotations on socket.wq

2018-07-27 Thread Christoph Hellwig
We never use RCU protection for it, just a lot of cargo-cult
rcu_dereference_protected calls.

Note that we do keep the kfree_rcu call for it, as the references through
struct sock are RCU protected and thus might require a grace period before
freeing.

Signed-off-by: Christoph Hellwig 
---
 include/linux/net.h |  2 +-
 include/net/sock.h  |  2 +-
 net/socket.c| 10 --
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index 6554d3ba4396..e0930678c8bf 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -114,7 +114,7 @@ struct socket {
 
unsigned long   flags;
 
-   struct socket_wq __rcu  *wq;
+   struct socket_wq*wq;
 
struct file *file;
struct sock *sk;
diff --git a/include/net/sock.h b/include/net/sock.h
index b3b75419eafe..ad85d37c83c8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1725,7 +1725,7 @@ static inline void sock_graft(struct sock *sk, struct 
socket *parent)
 {
WARN_ON(parent->sk);
write_lock_bh(&sk->sk_callback_lock);
-   sk->sk_wq = parent->wq;
+   rcu_assign_pointer(sk->sk_wq, parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
sk->sk_uid = SOCK_INODE(parent)->i_uid;
diff --git a/net/socket.c b/net/socket.c
index 85633622c94d..39e0afbdd797 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -251,7 +251,7 @@ static struct inode *sock_alloc_inode(struct super_block 
*sb)
init_waitqueue_head(&wq->wait);
wq->fasync_list = NULL;
wq->flags = 0;
-   RCU_INIT_POINTER(ei->socket.wq, wq);
+   ei->socket.wq = wq;
 
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
@@ -265,11 +265,9 @@ static struct inode *sock_alloc_inode(struct super_block 
*sb)
 static void sock_destroy_inode(struct inode *inode)
 {
struct socket_alloc *ei;
-   struct socket_wq *wq;
 
ei = container_of(inode, struct socket_alloc, vfs_inode);
-   wq = rcu_dereference_protected(ei->socket.wq, 1);
-   kfree_rcu(wq, rcu);
+   kfree_rcu(ei->socket.wq, rcu);
kmem_cache_free(sock_inode_cachep, ei);
 }
 
@@ -603,7 +601,7 @@ static void __sock_release(struct socket *sock, struct 
inode *inode)
module_put(owner);
}
 
-   if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
+   if (sock->wq->fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);
 
if (!sock->file) {
@@ -1172,7 +1170,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
return -EINVAL;
 
lock_sock(sk);
-   wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk));
+   wq = sock->wq;
fasync_helper(fd, filp, on, &wq->fasync_list);
 
if (!wq->fasync_list)
-- 
2.18.0



socket poll related cleanups

2018-07-27 Thread Christoph Hellwig
A couple of cleanups I stumbled upon when studying the networking
poll code.


Re: [PATCH net-next v4 1/4] net/sched: user-space can't set unknown tcfa_action values

2018-07-27 Thread Paolo Abeni
On Thu, 2018-07-26 at 21:28 -0300, Marcelo Ricardo Leitner wrote:
> Hi,
> 
> On Thu, Jul 26, 2018 at 04:34:57PM +0200, Paolo Abeni wrote:
> ...
> > @@ -895,6 +904,14 @@ struct tc_action *tcf_action_init_1(struct net *net, 
> > struct tcf_proto *tp,
> > }
> > }
> >  
> > +   if (!tcf_action_valid(a->tcfa_action)) {
> > +   net_warn_ratelimited("invalid %d action value, using "
> > +"TC_ACT_UNSPEC instead", a->tcfa_action);
> 
> Now that it is reporting the error via extack, do we really need this
> warn net_warn?
> extack will be shown as a warning by iproute2 even if the command
> succeeds.

That was requested by Jiri (modulo misinterpretation on my side).
My understanding is that the extack will warn whoever tried to push
the bad configuration, while the net_warn is targeting the host's
administrator.

Jiri, do you have a strong opinion on this, or did I misinterpret your
wording? Can I drop the net_warn?

Thanks!

> > +   NL_SET_ERR_MSG(extack, "invalid action value, using "
> > +  "TC_ACT_UNSPEC instead");
> 
> Quoted strings shouldn't be broken down into multiple lines..

Thanks, 

will fix in v5 :(

Cheers,

Paolo



[PATCH net-next 8/8] selftests: mlxsw: Add test for trust-DSCP

2018-07-27 Thread Ido Schimmel
From: Petr Machata 

Add a test that exercises the new code. Send DSCP-tagged packets, and
observe how they are prioritized in the switch and the DSCP is updated
on egress again.

Signed-off-by: Petr Machata 
Signed-off-by: Ido Schimmel 
---
 .../drivers/net/mlxsw/qos_dscp_bridge.sh  | 248 ++
 1 file changed, 248 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh 
b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh
new file mode 100755
index 000000000000..418319f19108
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh
@@ -0,0 +1,248 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for DSCP prioritization and rewrite. Packets ingress $swp1 with a DSCP
+# tag and are prioritized according to the map at $swp1. They egress $swp2 and
+# the DSCP value is updated to match the map at that interface. The updated 
DSCP
+# tag is verified at $h2.
+#
+# ICMP responses are produced with the same DSCP tag that arrived at $h2. They
+# go through prioritization at $swp2 and DSCP retagging at $swp1. The tag is
+# verified at $h1--it should match the original tag.
+#
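+# +----------------------+                             +----------------------+
+# | H1                   |                             |                   H2 |
+# |    + $h1             |                             |            $h2 +     |
+# |    | 192.0.2.1/28    |                             |   192.0.2.2/28 |     |
+# +----|-----------------+                             +----------------|-----+
+#      |                                                                |
+# +----|----------------------------------------------------------------|-----+
+# | SW |                                                                |     |
+# |  +-|----------------------------------------------------------------|-+   |
+# |  | + $swp1                         BR                         $swp2 + |   |
+# |  |   APP=0,5,10 .. 7,5,17                      APP=0,5,20 .. 7,5,27   |   |
+# |  +--------------------------------------------------------------------+   |
+# +---------------------------------------------------------------------------+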
+
+ALL_TESTS="
+   ping_ipv4
+   test_dscp
+"
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+
+__dscp_capture_add_del()
+{
+   local add_del=$1; shift
+   local dev=$1; shift
+   local base=$1; shift
+   local dscp;
+
+   for prio in {0..7}; do
+   dscp=$((base + prio))
+   __icmp_capture_add_del $add_del $dscp "" $dev \
+  "ip_tos $((dscp << 2))"
+   done
+}
+
+dscp_capture_install()
+{
+   local dev=$1; shift
+   local base=$1; shift
+
+   __dscp_capture_add_del add $dev $base
+}
+
+dscp_capture_uninstall()
+{
+   local dev=$1; shift
+   local base=$1; shift
+
+   __dscp_capture_add_del del $dev $base
+}
+
+h1_create()
+{
+   local dscp;
+
+   simple_if_init $h1 192.0.2.1/28
+   tc qdisc add dev $h1 clsact
+   dscp_capture_install $h1 10
+}
+
+h1_destroy()
+{
+   dscp_capture_uninstall $h1 10
+   tc qdisc del dev $h1 clsact
+   simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+   simple_if_init $h2 192.0.2.2/28
+   tc qdisc add dev $h2 clsact
+   dscp_capture_install $h2 20
+}
+
+h2_destroy()
+{
+   dscp_capture_uninstall $h2 20
+   tc qdisc del dev $h2 clsact
+   simple_if_fini $h2 192.0.2.2/28
+}
+
+dscp_map()
+{
+   local base=$1; shift
+
+   for prio in {0..7}; do
+   echo app=$prio,5,$((base + prio))
+   done
+}
+
+lldpad_wait()
+{
+   local dev=$1; shift
+
+   while lldptool -t -i $dev -V APP -c app | grep -q pending; do
+   echo "$dev: waiting for lldpad to push pending APP updates"
+   sleep 5
+   done
+}
+
+switch_create()
+{
+   ip link add name br1 type bridge vlan_filtering 1
+   ip link set dev br1 up
+   ip link set dev $swp1 master br1
+   ip link set dev $swp1 up
+   ip link set dev $swp2 master br1
+   ip link set dev $swp2 up
+
+   lldptool -T -i $swp1 -V APP $(dscp_map 10) >/dev/null
+   lldptool -T -i $swp2 -V APP $(dscp_map 20) >/dev/null
+   lldpad_wait $swp1
+   lldpad_wait $swp2
+}
+
+switch_destroy()
+{
+   lldptool -T -i $swp2 -V APP -d $(dscp_map 20) >/dev/null
+   lldptool -T -i $swp1 -V APP -d $(dscp_map 10) >/dev/null
+
+   # Give lldpad a chance to push down the changes. If the device is downed
+   # too soon, the updates will be left pending, but will have been struck
+   # off the lldpad's DB already, and we won't be able to tell. Then on
+   # next test iteration this would cause weirdness as newly-added APP
+   # rules conflict with the old ones, sometimes getting stuck in an
+  

[PATCH net-next 5/8] mlxsw: reg: Add QoS ReWrite Enable Register

2018-07-27 Thread Ido Schimmel
From: Petr Machata 

This register configures the rewrite enable (whether PCP or DSCP value
in packet should be updated according to packet priority) per receive
port.

Signed-off-by: Petr Machata 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h 
b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index c50e754dd725..02c0e1531ed2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -3367,6 +3367,44 @@ static inline void mlxsw_reg_qeec_pack(char *payload, u8 
local_port,
mlxsw_reg_qeec_next_element_index_set(payload, next_index);
 }
 
+/* QRWE - QoS ReWrite Enable
+ * -
+ * This register configures the rewrite enable per receive port.
+ */
+#define MLXSW_REG_QRWE_ID 0x400F
+#define MLXSW_REG_QRWE_LEN 0x08
+
+MLXSW_REG_DEFINE(qrwe, MLXSW_REG_QRWE_ID, MLXSW_REG_QRWE_LEN);
+
+/* reg_qrwe_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: CPU port is supported. No support for router port.
+ */
+MLXSW_ITEM32(reg, qrwe, local_port, 0x00, 16, 8);
+
+/* reg_qrwe_dscp
+ * Whether to enable DSCP rewrite (default is 0, don't rewrite).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qrwe, dscp, 0x04, 1, 1);
+
+/* reg_qrwe_pcp
+ * Whether to enable PCP and DEI rewrite (default is 0, don't rewrite).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qrwe, pcp, 0x04, 0, 1);
+
+static inline void mlxsw_reg_qrwe_pack(char *payload, u8 local_port,
+  bool rewrite_pcp, bool rewrite_dscp)
+{
+   MLXSW_REG_ZERO(qrwe, payload);
+   mlxsw_reg_qrwe_local_port_set(payload, local_port);
+   mlxsw_reg_qrwe_pcp_set(payload, rewrite_pcp);
+   mlxsw_reg_qrwe_dscp_set(payload, rewrite_dscp);
+}
+
 /* QPDPM - QoS Port DSCP to Priority Mapping Register
  * --
  * This register controls the mapping from DSCP field to
@@ -8632,6 +8670,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
MLXSW_REG(qpcr),
MLXSW_REG(qtct),
MLXSW_REG(qeec),
+   MLXSW_REG(qrwe),
MLXSW_REG(qpdpm),
MLXSW_REG(pmlp),
MLXSW_REG(pmtu),
-- 
2.17.1



[PATCH net-next 7/8] mlxsw: spectrum: Support ieee_setapp, ieee_delapp

2018-07-27 Thread Ido Schimmel
From: Petr Machata 

The APP TLVs are used for communicating priority-to-protocol ID maps for
a given netdevice. Support the following APP TLVs:

- DSCP (selector 5) to configure priority-to-DSCP code point maps. Use
  these maps to configure packet priority on ingress, and DSCP code
  point rewrite on egress.

- Default priority (selector 1, PID 0) to configure priority for the
  DSCP code points that don't have one assigned by the DSCP selector. In
  future this could also be used for assigning default port priority
  when a packet arrives without DSCP tagging.

Besides setting up the maps themselves, also configure port trust level
and rewrite bits.

Port trust level determines whether, for a packet arriving through a
certain port, the priority should be determined based on PCP or DSCP
header fields. So far, mlxsw kept the device default of trust-PCP. Now,
as soon as the first DSCP APP TLV is configured, switch to trust-DSCP.
Only when all DSCP APP TLVs are removed, switch back to trust-PCP again.
Note that the default priority APP TLV doesn't impact the trust level
configuration.

Rewrite bits determine whether DSCP and PCP fields of egressing packets
should be updated according to switch priority. When port trust is
switched to DSCP, enable rewrite of DSCP field.

Signed-off-by: Petr Machata 
Signed-off-by: Ido Schimmel 
---
 .../net/ethernet/mellanox/mlxsw/spectrum.h|   4 +-
 .../ethernet/mellanox/mlxsw/spectrum_dcb.c| 269 +-
 2 files changed, 271 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index bc2704193666..13eca1a79d52 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -1,6 +1,6 @@
 /*
  * drivers/net/ethernet/mellanox/mlxsw/spectrum.h
- * Copyright (c) 2015-2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2015-2017 Jiri Pirko 
  * Copyright (c) 2015 Ido Schimmel 
  * Copyright (c) 2015 Elad Raz 
@@ -54,6 +54,7 @@
 #include "core.h"
 #include "core_acl_flex_keys.h"
 #include "core_acl_flex_actions.h"
+#include "reg.h"
 
 #define MLXSW_SP_FID_8021D_MAX 1024
 
@@ -243,6 +244,7 @@ struct mlxsw_sp_port {
struct ieee_ets *ets;
struct ieee_maxrate *maxrate;
struct ieee_pfc *pfc;
+   enum mlxsw_reg_qpts_trust_state trust_state;
} dcb;
struct {
u8 module;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c
index b6ed7f7c531e..c31aeb25ab5a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c
@@ -1,6 +1,6 @@
 /*
  * drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c
- * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2016 Ido Schimmel 
  *
  * Redistribution and use in source and binary forms, with or without
@@ -255,6 +255,270 @@ static int mlxsw_sp_dcbnl_ieee_setets(struct net_device 
*dev,
return 0;
 }
 
+static int mlxsw_sp_dcbnl_app_validate(struct net_device *dev,
+  struct dcb_app *app)
+{
+   int prio;
+
+   if (app->priority >= IEEE_8021QAZ_MAX_TCS) {
+   netdev_err(dev, "APP entry with priority value %u is invalid\n",
+  app->priority);
+   return -EINVAL;
+   }
+
+   switch (app->selector) {
+   case IEEE_8021QAZ_APP_SEL_DSCP:
+   if (app->protocol >= 64) {
+   netdev_err(dev, "DSCP APP entry with protocol value %u 
is invalid\n",
+  app->protocol);
+   return -EINVAL;
+   }
+
+   /* Warn about any DSCP APP entries with the same PID. */
+   prio = fls(dcb_ieee_getapp_mask(dev, app));
+   if (prio--) {
+   if (prio < app->priority)
+   netdev_warn(dev, "Choosing priority %d for DSCP 
%d in favor of previously-active value of %d\n",
+   app->priority, app->protocol, prio);
+   else if (prio > app->priority)
+   netdev_warn(dev, "Ignoring new priority %d for 
DSCP %d in favor of current value of %d\n",
+   app->priority, app->protocol, prio);
+   }
+   break;
+
+   case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+   if (app->protocol) {
+   netdev_err(dev, "EtherType APP entries with protocol 
value != 0 not supported\n");
+   return -EINVAL;
+   }
+   break;
+
+   default:
+   

[PATCH net-next 6/8] mlxsw: reg: Add QoS Priority to DSCP Mapping Register

2018-07-27 Thread Ido Schimmel
From: Petr Machata 

This register controls mapping from Priority to DSCP for purposes of
rewrite. Note that rewrite happens as the packet is transmitted provided
that the DSCP rewrite bit is enabled for the packet.

Signed-off-by: Petr Machata 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 89 +++
 1 file changed, 89 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h 
b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 02c0e1531ed2..e52841627966 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -3405,6 +3405,94 @@ static inline void mlxsw_reg_qrwe_pack(char *payload, u8 
local_port,
mlxsw_reg_qrwe_dscp_set(payload, rewrite_dscp);
 }
 
+/* QPDSM - QoS Priority to DSCP Mapping
+ * 
+ * QoS Priority to DSCP Mapping Register
+ */
+#define MLXSW_REG_QPDSM_ID 0x4011
+#define MLXSW_REG_QPDSM_BASE_LEN 0x04 /* base length, without records */
+#define MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN 0x4 /* record length */
+#define MLXSW_REG_QPDSM_PRIO_ENTRY_REC_MAX_COUNT 16
+#define MLXSW_REG_QPDSM_LEN (MLXSW_REG_QPDSM_BASE_LEN +
\
+MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN *   \
+MLXSW_REG_QPDSM_PRIO_ENTRY_REC_MAX_COUNT)
+
+MLXSW_REG_DEFINE(qpdsm, MLXSW_REG_QPDSM_ID, MLXSW_REG_QPDSM_LEN);
+
+/* reg_qpdsm_local_port
+ * Local Port. Supported for data packets from CPU port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qpdsm, local_port, 0x00, 16, 8);
+
+/* reg_qpdsm_prio_entry_color0_e
+ * Enable update of the entry for color 0 and a given port.
+ * Access: WO
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color0_e,
+MLXSW_REG_QPDSM_BASE_LEN, 31, 1,
+MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdsm_prio_entry_color0_dscp
+ * DSCP field in the outer label of the packet for color 0 and a given port.
+ * Reserved when e=0.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color0_dscp,
+MLXSW_REG_QPDSM_BASE_LEN, 24, 6,
+MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdsm_prio_entry_color1_e
+ * Enable update of the entry for color 1 and a given port.
+ * Access: WO
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color1_e,
+MLXSW_REG_QPDSM_BASE_LEN, 23, 1,
+MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdsm_prio_entry_color1_dscp
+ * DSCP field in the outer label of the packet for color 1 and a given port.
+ * Reserved when e=0.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color1_dscp,
+MLXSW_REG_QPDSM_BASE_LEN, 16, 6,
+MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdsm_prio_entry_color2_e
+ * Enable update of the entry for color 2 and a given port.
+ * Access: WO
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color2_e,
+MLXSW_REG_QPDSM_BASE_LEN, 15, 1,
+MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdsm_prio_entry_color2_dscp
+ * DSCP field in the outer label of the packet for color 2 and a given port.
+ * Reserved when e=0.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color2_dscp,
+MLXSW_REG_QPDSM_BASE_LEN, 8, 6,
+MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+static inline void mlxsw_reg_qpdsm_pack(char *payload, u8 local_port)
+{
+   MLXSW_REG_ZERO(qpdsm, payload);
+   mlxsw_reg_qpdsm_local_port_set(payload, local_port);
+}
+
+static inline void
+mlxsw_reg_qpdsm_prio_pack(char *payload, unsigned short prio, u8 dscp)
+{
+   mlxsw_reg_qpdsm_prio_entry_color0_e_set(payload, prio, 1);
+   mlxsw_reg_qpdsm_prio_entry_color0_dscp_set(payload, prio, dscp);
+   mlxsw_reg_qpdsm_prio_entry_color1_e_set(payload, prio, 1);
+   mlxsw_reg_qpdsm_prio_entry_color1_dscp_set(payload, prio, dscp);
+   mlxsw_reg_qpdsm_prio_entry_color2_e_set(payload, prio, 1);
+   mlxsw_reg_qpdsm_prio_entry_color2_dscp_set(payload, prio, dscp);
+}
+
 /* QPDPM - QoS Port DSCP to Priority Mapping Register
  * --
  * This register controls the mapping from DSCP field to
@@ -8671,6 +8759,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
MLXSW_REG(qtct),
MLXSW_REG(qeec),
MLXSW_REG(qrwe),
+   MLXSW_REG(qpdsm),
MLXSW_REG(qpdpm),
MLXSW_REG(pmlp),
MLXSW_REG(pmtu),
-- 
2.17.1



[PATCH net-next 3/8] mlxsw: reg: Add QoS Port DSCP to Priority Mapping Register

2018-07-27 Thread Ido Schimmel
From: Petr Machata 

The QPDPM register controls the mapping from DSCP field to Switch
Priority for IP packets.

Signed-off-by: Petr Machata 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 52 +++
 1 file changed, 52 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h 
b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index fd2e3dd166d2..411d06b5aaae 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -3329,6 +3329,57 @@ static inline void mlxsw_reg_qeec_pack(char *payload, u8 
local_port,
mlxsw_reg_qeec_next_element_index_set(payload, next_index);
 }
 
+/* QPDPM - QoS Port DSCP to Priority Mapping Register
+ * --
+ * This register controls the mapping from DSCP field to
+ * Switch Priority for IP packets.
+ */
+#define MLXSW_REG_QPDPM_ID 0x4013
+#define MLXSW_REG_QPDPM_BASE_LEN 0x4 /* base length, without records */
+#define MLXSW_REG_QPDPM_DSCP_ENTRY_REC_LEN 0x2 /* record length */
+#define MLXSW_REG_QPDPM_DSCP_ENTRY_REC_MAX_COUNT 64
+#define MLXSW_REG_QPDPM_LEN (MLXSW_REG_QPDPM_BASE_LEN +
\
+MLXSW_REG_QPDPM_DSCP_ENTRY_REC_LEN *   \
+MLXSW_REG_QPDPM_DSCP_ENTRY_REC_MAX_COUNT)
+
+MLXSW_REG_DEFINE(qpdpm, MLXSW_REG_QPDPM_ID, MLXSW_REG_QPDPM_LEN);
+
+/* reg_qpdpm_local_port
+ * Local Port. Supported for data packets from CPU port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qpdpm, local_port, 0x00, 16, 8);
+
+/* reg_qpdpm_dscp_e
+ * Enable update of the specific entry. When cleared, the switch_prio and color
+ * fields are ignored and the previous switch_prio and color values are
+ * preserved.
+ * Access: WO
+ */
+MLXSW_ITEM16_INDEXED(reg, qpdpm, dscp_entry_e, MLXSW_REG_QPDPM_BASE_LEN, 15, 1,
+MLXSW_REG_QPDPM_DSCP_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdpm_dscp_prio
+ * The new Switch Priority value for the relevant DSCP value.
+ * Access: RW
+ */
+MLXSW_ITEM16_INDEXED(reg, qpdpm, dscp_entry_prio,
+MLXSW_REG_QPDPM_BASE_LEN, 0, 4,
+MLXSW_REG_QPDPM_DSCP_ENTRY_REC_LEN, 0x00, false);
+
+static inline void mlxsw_reg_qpdpm_pack(char *payload, u8 local_port)
+{
+   MLXSW_REG_ZERO(qpdpm, payload);
+   mlxsw_reg_qpdpm_local_port_set(payload, local_port);
+}
+
+static inline void
+mlxsw_reg_qpdpm_dscp_pack(char *payload, unsigned short dscp, u8 prio)
+{
+   mlxsw_reg_qpdpm_dscp_entry_e_set(payload, dscp, 1);
+   mlxsw_reg_qpdpm_dscp_entry_prio_set(payload, dscp, prio);
+}
+
 /* PMLP - Ports Module to Local Port Register
  * --
  * Configures the assignment of modules to local ports.
@@ -8542,6 +8593,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
MLXSW_REG(qpcr),
MLXSW_REG(qtct),
MLXSW_REG(qeec),
+   MLXSW_REG(qpdpm),
MLXSW_REG(pmlp),
MLXSW_REG(pmtu),
MLXSW_REG(ptys),
-- 
2.17.1



[PATCH net-next 2/8] net: dcb: Add priority-to-DSCP map getters

2018-07-27 Thread Ido Schimmel
From: Petr Machata 

On ingress, a network device such as a switch assigns to packets
priority based on various criteria. Common options include interpreting
PCP and DSCP fields according to user configuration. When a packet
egresses the switch, a reverse process may rewrite PCP and/or DSCP
values according to packet priority.

The following three functions support a) obtaining a DSCP-to-priority
map or vice versa, and b) finding default-priority entries in APP
database.

The DCB subsystem supports for APP entries a very generous M:N mapping
between priorities and protocol identifiers. Understandably,
several (say) DSCP values can map to the same priority. But this
asymmetry holds the other way around as well--one priority can map to
several DSCP values. For this reason, the following functions operate in
terms of bitmaps, with ones in positions that match some APP entry.

- dcb_ieee_getapp_dscp_prio_mask_map() to compute for a given netdevice
  a map of DSCP-to-priority-mask, which gives for each DSCP value a
  bitmap of priorities related to that DSCP value by APP, along the
  lines of dcb_ieee_getapp_mask().

- dcb_ieee_getapp_prio_dscp_mask_map() similarly to compute for a given
  netdevice a map from priorities to a bitmap of DSCPs.

- dcb_ieee_getapp_default_prio_mask() which finds all default-priority
  rules for a given port in APP database, and returns a mask of
  priorities allowed by these default-priority rules.

Signed-off-by: Petr Machata 
Signed-off-by: Ido Schimmel 
---
 include/net/dcbnl.h | 13 +++
 net/dcb/dcbnl.c | 86 +
 2 files changed, 99 insertions(+)

diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index 0e5e91be2d30..e22a8a3c089b 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -34,6 +34,19 @@ int dcb_ieee_setapp(struct net_device *, struct dcb_app *);
 int dcb_ieee_delapp(struct net_device *, struct dcb_app *);
 u8 dcb_ieee_getapp_mask(struct net_device *, struct dcb_app *);
 
+struct dcb_ieee_app_prio_map {
+   u64 map[IEEE_8021QAZ_MAX_TCS];
+};
+void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev,
+   struct dcb_ieee_app_prio_map *p_map);
+
+struct dcb_ieee_app_dscp_map {
+   u8 map[64];
+};
+void dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev,
+   struct dcb_ieee_app_dscp_map *p_map);
+u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev);
+
 int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd,
  u32 seq, u32 pid);
 int dcbnl_cee_notify(struct net_device *dev, int event, int cmd,
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 013fdb6fa07a..a556cd708885 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1958,6 +1958,92 @@ int dcb_ieee_delapp(struct net_device *dev, struct 
dcb_app *del)
 }
 EXPORT_SYMBOL(dcb_ieee_delapp);
 
+/**
+ * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from
+ * priorities to the DSCP values assigned to that priority. Initialize p_map
+ * such that each map element holds a bit mask of DSCP values configured for
+ * that priority by APP entries.
+ */
+void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev,
+   struct dcb_ieee_app_prio_map *p_map)
+{
+   int ifindex = dev->ifindex;
+   struct dcb_app_type *itr;
+   u8 prio;
+
+   memset(p_map->map, 0, sizeof(p_map->map));
+
+   spin_lock_bh(&dcb_lock);
+   list_for_each_entry(itr, &dcb_app_list, list) {
+   if (itr->ifindex == ifindex &&
+   itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
+   itr->app.protocol < 64 &&
+   itr->app.priority < IEEE_8021QAZ_MAX_TCS) {
+   prio = itr->app.priority;
+   p_map->map[prio] |= 1ULL << itr->app.protocol;
+   }
+   }
+   spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map);
+
+/**
+ * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from
+ * DSCP values to the priorities assigned to that DSCP value. Initialize p_map
+ * such that each map element holds a bit mask of priorities configured for a
+ * given DSCP value by APP entries.
+ */
+void
+dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev,
+  struct dcb_ieee_app_dscp_map *p_map)
+{
+   int ifindex = dev->ifindex;
+   struct dcb_app_type *itr;
+
+   memset(p_map->map, 0, sizeof(p_map->map));
+
+   spin_lock_bh(&dcb_lock);
+   list_for_each_entry(itr, &dcb_app_list, list) {
+   if (itr->ifindex == ifindex &&
+   itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
+   itr->app.protocol < 64 &&
+   itr->app.priority < IEEE_8021QAZ_MAX_TCS)
+   p_map->map[itr->app.protocol] |= 1 << 

[PATCH net-next 4/8] mlxsw: reg: Add QoS Priority Trust State Register

2018-07-27 Thread Ido Schimmel
From: Petr Machata 

The QPTS register controls the port policy to calculate the switch
priority and packet color based on incoming packet fields.

Signed-off-by: Petr Machata 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h 
b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 411d06b5aaae..c50e754dd725 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -3017,6 +3017,44 @@ static inline void mlxsw_reg_iedr_rec_pack(char 
*payload, int rec_index,
mlxsw_reg_iedr_rec_index_start_set(payload, rec_index, rec_index_start);
 }
 
+/* QPTS - QoS Priority Trust State Register
+ * 
+ * This register controls the port policy to calculate the switch priority and
+ * packet color based on incoming packet fields.
+ */
+#define MLXSW_REG_QPTS_ID 0x4002
+#define MLXSW_REG_QPTS_LEN 0x8
+
+MLXSW_REG_DEFINE(qpts, MLXSW_REG_QPTS_ID, MLXSW_REG_QPTS_LEN);
+
+/* reg_qpts_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: CPU port is supported.
+ */
+MLXSW_ITEM32(reg, qpts, local_port, 0x00, 16, 8);
+
+enum mlxsw_reg_qpts_trust_state {
+   MLXSW_REG_QPTS_TRUST_STATE_PCP = 1,
+   MLXSW_REG_QPTS_TRUST_STATE_DSCP = 2, /* For MPLS, trust EXP. */
+};
+
+/* reg_qpts_trust_state
+ * Trust state for a given port.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qpts, trust_state, 0x04, 0, 3);
+
+static inline void mlxsw_reg_qpts_pack(char *payload, u8 local_port,
+  enum mlxsw_reg_qpts_trust_state ts)
+{
+   MLXSW_REG_ZERO(qpts, payload);
+
+   mlxsw_reg_qpts_local_port_set(payload, local_port);
+   mlxsw_reg_qpts_trust_state_set(payload, ts);
+}
+
 /* QPCR - QoS Policer Configuration Register
  * -
  * The QPCR register is used to create policers - that limit
@@ -8590,6 +8628,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
MLXSW_REG(percr),
MLXSW_REG(pererp),
MLXSW_REG(iedr),
+   MLXSW_REG(qpts),
MLXSW_REG(qpcr),
MLXSW_REG(qtct),
MLXSW_REG(qeec),
-- 
2.17.1



[PATCH net-next 0/8] mlxsw: Support DSCP prioritization and rewrite

2018-07-27 Thread Ido Schimmel
Petr says:

On ingress, a network device such as a switch assigns to packets
priority based on various criteria. Common options include interpreting
PCP and DSCP fields according to user configuration. When a packet
egresses the switch, a reverse process may rewrite PCP and/or DSCP
headers according to packet priority.

So far, mlxsw has supported prioritization based on PCP (802.1p priority
tag). This patch set introduces support for prioritization based on
DSCP, and DSCP rewrite.

To configure the DSCP-to-priority maps, the user is expected to invoke
ieee_setapp and ieee_delapp DCBNL ops, e.g. by using lldptool:

# lldptool -T -i sw1p6 -V APP app=3,5,24 # (priority 3, DSCP, 24)

To decide whether or not to pay attention to DSCP values, the Spectrum
switch recognizes a per-port configuration of trust level. Until the
first APP rule is added for a given port, this port's trust level stays
at PCP, meaning that PCP is used for packet prioritization. With the
first DSCP APP rule, the port is configured to trust DSCP instead, and
it stays there until all DSCP APP rules are removed again.

Besides the DSCP (value 5) selector, another selector that plays into
packet prioritization is Ethernet type (value 1) with PID of 0. Such APP
entries denote default priority[1]:

# lldptool -T -i sw1p6 -V APP app=3,1,0 # (default priority 3)

With this patch set, mlxsw uses these values to configure priority for
DSCP values not explicitly specified in DSCP APP map. In the future we
expect to also use this to configure default port priority for untagged
packets.

Access to DSCP-to-priority map, priority-to-DSCP map, and default
priority for a port is exposed through three new DCB helpers. Like the
already-existing dcb_ieee_getapp_mask() helper, these helpers operate in
terms of bitmaps, to support the arbitrary M:N mapping that the APP
rules allow. Such interface presents all the relevant information from
the APP database without necessitating exposition of iterators, locking
or other complex primitives. It is up to the driver to then digest the
mapping in a way that the device supports. In this patch set, mlxsw
resolves conflicts by favoring higher-numbered DSCP values and
priorities.

In this patchset:

- Patch #1 fixes a bug in DCB APP database management.
- Patch #2 adds the getters described above.
- Patches #3-#6 add Spectrum configuration registers.
- Patch #7 adds the mlxsw logic that configures the device according to
  APP rules.
- Patch #8 adds a self-test. The test is added to the subdirectory
  drivers/net/mlxsw. Even though it's not particularly specific to
  mlxsw, it's not suitable for running on soft devices (which don't
  support the ieee_getapp et al.), and thus isn't a good fit for the
  general net/forwarding directory.

[1] 802.1Q-2014, Table D-9

Petr Machata (8):
  net: dcb: For wild-card lookups, use priority -1, not 0
  net: dcb: Add priority-to-DSCP map getters
  mlxsw: reg: Add QoS Port DSCP to Priority Mapping Register
  mlxsw: reg: Add QoS Priority Trust State Register
  mlxsw: reg: Add QoS ReWrite Enable Register
  mlxsw: reg: Add QoS Priority to DSCP Mapping Register
  mlxsw: spectrum: Support ieee_setapp, ieee_delapp
  selftests: mlxsw: Add test for trust-DSCP

 drivers/net/ethernet/mellanox/mlxsw/reg.h | 219 ++
 .../net/ethernet/mellanox/mlxsw/spectrum.h|   4 +-
 .../ethernet/mellanox/mlxsw/spectrum_dcb.c| 269 +-
 include/net/dcbnl.h   |  13 +
 net/dcb/dcbnl.c   |  97 ++-
 .../drivers/net/mlxsw/qos_dscp_bridge.sh  | 248 
 6 files changed, 844 insertions(+), 6 deletions(-)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh

-- 
2.17.1



[PATCH net-next 1/8] net: dcb: For wild-card lookups, use priority -1, not 0

2018-07-27 Thread Ido Schimmel
From: Petr Machata 

The function dcb_app_lookup walks the list of specified DCB APP entries,
looking for one that matches the given criteria: ifindex, selector,
protocol ID and optionally also priority. The "don't care" value for
priority is set to 0, because that priority has not been allowed under
CEE regime, which predates the IEEE standardization.

Under IEEE, 0 is a valid priority number. But because dcb_app_lookup
considers zero a wild card, attempts to add an APP entry with priority 0
fail when other entries exist for a given ifindex / selector / PID
triplet.

Fix by changing the wild-card value to -1.

Signed-off-by: Petr Machata 
Signed-off-by: Ido Schimmel 
---
 net/dcb/dcbnl.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 2589a6b78aa1..013fdb6fa07a 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1786,7 +1786,7 @@ static struct dcb_app_type *dcb_app_lookup(const struct 
dcb_app *app,
if (itr->app.selector == app->selector &&
itr->app.protocol == app->protocol &&
itr->ifindex == ifindex &&
-   (!prio || itr->app.priority == prio))
+   ((prio == -1) || itr->app.priority == prio))
return itr;
}
 
@@ -1821,7 +1821,8 @@ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app)
u8 prio = 0;
 
spin_lock_bh(_lock);
-   if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
+   itr = dcb_app_lookup(app, dev->ifindex, -1);
+   if (itr)
prio = itr->app.priority;
spin_unlock_bh(_lock);
 
@@ -1849,7 +1850,8 @@ int dcb_setapp(struct net_device *dev, struct dcb_app 
*new)
 
spin_lock_bh(_lock);
/* Search for existing match and replace */
-   if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) {
+   itr = dcb_app_lookup(new, dev->ifindex, -1);
+   if (itr) {
if (new->priority)
itr->app.priority = new->priority;
else {
@@ -1882,7 +1884,8 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct 
dcb_app *app)
u8 prio = 0;
 
spin_lock_bh(_lock);
-   if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
+   itr = dcb_app_lookup(app, dev->ifindex, -1);
+   if (itr)
prio |= 1 << itr->app.priority;
spin_unlock_bh(_lock);
 
-- 
2.17.1



[PATCH iproute2 0/3] l2tp: remove unused fields in struct l2tp_parm

2018-07-27 Thread Guillaume Nault
Several fields of struct l2tp_parm are handled by create_session() but
can't actually be set by user.
Most of these fields can also be set by get_response(), but are ignored
afterwards.

Since these fields can't have any visible effect, let's just remove
them.

Guillaume Nault (3):
  l2tp: drop data_seq
  l2tp: drop mtu
  l2tp: drop lns_mode

 ip/ipl2tp.c | 13 -
 1 file changed, 13 deletions(-)

-- 
2.18.0



[PATCH iproute2 3/3] l2tp: drop lns_mode

2018-07-27 Thread Guillaume Nault
This option is never set.

Signed-off-by: Guillaume Nault 
---
 ip/ipl2tp.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ip/ipl2tp.c b/ip/ipl2tp.c
index 41fefb85..5e7f0390 100644
--- a/ip/ipl2tp.c
+++ b/ip/ipl2tp.c
@@ -58,7 +58,6 @@ struct l2tp_parm {
unsigned int udp_csum:1;
unsigned int recv_seq:1;
unsigned int send_seq:1;
-   unsigned int lns_mode:1;
unsigned int tunnel:1;
unsigned int session:1;
int reorder_timeout;
@@ -161,8 +160,6 @@ static int create_session(struct l2tp_parm *p)
addattr8(, 1024, L2TP_ATTR_RECV_SEQ, 1);
if (p->send_seq)
addattr8(, 1024, L2TP_ATTR_SEND_SEQ, 1);
-   if (p->lns_mode)
-   addattr(, 1024, L2TP_ATTR_LNS_MODE);
if (p->reorder_timeout)
addattr64(, 1024, L2TP_ATTR_RECV_TIMEOUT,
  p->reorder_timeout);
-- 
2.18.0



[PATCH iproute2 2/3] l2tp: drop mtu

2018-07-27 Thread Guillaume Nault
This option can't be set by user and is never printed.

Signed-off-by: Guillaume Nault 
---
 ip/ipl2tp.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/ip/ipl2tp.c b/ip/ipl2tp.c
index aca9912c..41fefb85 100644
--- a/ip/ipl2tp.c
+++ b/ip/ipl2tp.c
@@ -53,7 +53,6 @@ struct l2tp_parm {
inet_prefix peer_ip;
 
uint16_t pw_type;
-   uint16_t mtu;
unsigned int udp6_csum_tx:1;
unsigned int udp6_csum_rx:1;
unsigned int udp_csum:1;
@@ -158,8 +157,6 @@ static int create_session(struct l2tp_parm *p)
addattr8(, 1024, L2TP_ATTR_L2SPEC_TYPE, p->l2spec_type);
addattr8(, 1024, L2TP_ATTR_L2SPEC_LEN, p->l2spec_len);
 
-   if (p->mtu)
-   addattr16(, 1024, L2TP_ATTR_MTU, p->mtu);
if (p->recv_seq)
addattr8(, 1024, L2TP_ATTR_RECV_SEQ, 1);
if (p->send_seq)
@@ -413,8 +410,6 @@ static int get_response(struct nlmsghdr *n, void *arg)
p->local_udp_port = rta_getattr_u16(attrs[L2TP_ATTR_UDP_SPORT]);
if (attrs[L2TP_ATTR_UDP_DPORT])
p->peer_udp_port = rta_getattr_u16(attrs[L2TP_ATTR_UDP_DPORT]);
-   if (attrs[L2TP_ATTR_MTU])
-   p->mtu = rta_getattr_u16(attrs[L2TP_ATTR_MTU]);
if (attrs[L2TP_ATTR_IFNAME])
p->ifname = rta_getattr_str(attrs[L2TP_ATTR_IFNAME]);
 
-- 
2.18.0



[PATCH iproute2 1/3] l2tp: drop data_seq

2018-07-27 Thread Guillaume Nault
This option can't be set by user and is never printed. Furthermore,
L2TP_ATTR_DATA_SEQ has always been a noop in Linux.

Signed-off-by: Guillaume Nault 
---
 ip/ipl2tp.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/ip/ipl2tp.c b/ip/ipl2tp.c
index 05e96387..aca9912c 100644
--- a/ip/ipl2tp.c
+++ b/ip/ipl2tp.c
@@ -60,7 +60,6 @@ struct l2tp_parm {
unsigned int recv_seq:1;
unsigned int send_seq:1;
unsigned int lns_mode:1;
-   unsigned int data_seq:2;
unsigned int tunnel:1;
unsigned int session:1;
int reorder_timeout;
@@ -167,8 +166,6 @@ static int create_session(struct l2tp_parm *p)
addattr8(, 1024, L2TP_ATTR_SEND_SEQ, 1);
if (p->lns_mode)
addattr(, 1024, L2TP_ATTR_LNS_MODE);
-   if (p->data_seq)
-   addattr8(, 1024, L2TP_ATTR_DATA_SEQ, p->data_seq);
if (p->reorder_timeout)
addattr64(, 1024, L2TP_ATTR_RECV_TIMEOUT,
  p->reorder_timeout);
@@ -359,8 +356,6 @@ static int get_response(struct nlmsghdr *n, void *arg)
p->pw_type = rta_getattr_u16(attrs[L2TP_ATTR_PW_TYPE]);
if (attrs[L2TP_ATTR_ENCAP_TYPE])
p->encap = rta_getattr_u16(attrs[L2TP_ATTR_ENCAP_TYPE]);
-   if (attrs[L2TP_ATTR_DATA_SEQ])
-   p->data_seq = rta_getattr_u16(attrs[L2TP_ATTR_DATA_SEQ]);
if (attrs[L2TP_ATTR_CONN_ID])
p->tunnel_id = rta_getattr_u32(attrs[L2TP_ATTR_CONN_ID]);
if (attrs[L2TP_ATTR_PEER_CONN_ID])
-- 
2.18.0



pull-request: can-next 2018-01-16

2018-07-27 Thread Marc Kleine-Budde
Hello David,

this is a pull request for net-next/master consisting of 38 patches.

Dan Murphy's patch fixes the path to a file in the comment of the CAN
Error Message Frame Mask structure.

A patch by Colin Ian King fixes a typo in the cc770 driver.

The next patch is by me and sorts the Kconfig and Makefile entries of the
CAN-USB driver subdir alphabetically.

The patch by Jakob Unterwurzacher adds support for the UCAN USB-CAN
adapter.

YueHaibing's patch replaces an open-coded skb_put()+memset() by
skb_put_zero() in the CAN-dev infrastructure.

Zhu Yi provides a patch to enable multi-queue CAN devices.

Three patches by Luc Van Oostenryck fix the return type of several
drivers' xmit functions; I contribute a patch for a fourth driver.

Fabio Estevam's patch switches the flexcan driver to SPDX identifier.

Two patches by Jia-Ju Bai replace the mdelay() by a usleep_range() in
the sja1000 drivers.

The next 6 patches are by Anssi Hannula and refactor the xilinx CAN
driver and add support for the xilinx CAN FD core.

A patch by Gustavo A. R. Silva adds fallthrough annotation to the
peak_usb driver.

5 patches by Stephane Grosjean for the peak CANFD driver do some
cleanups and provide more improvements for further firmware releases.

The remaining 13 patches are by Jimmy Assarsson; the first ones clean up
the kvaser_usb driver, so that the later patches can add support for the
Kvaser USB hydra family.

regards,
Marc

---

The following changes since commit ecbcd689d74a394b711d2360aef7e5d007ec9d98:

  Merge tag 'mlx5e-updates-2018-07-26' of 
git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux (2018-07-26 21:33:24 
-0700)

are available in the Git repository at:

  
ssh://g...@gitolite.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next.git 
tags/linux-can-next-for-4.19-20180727

for you to fetch changes up to 1f6ed42c742e8d1cfd3811ef7a134eaa75a511d6:

  can: kvaser_usb: Simplify struct kvaser_cmd_cardinfo (2018-07-27 10:40:19 
+0200)


linux-can-next-for-4.19-20180727


Anssi Hannula (6):
  can: xilinx_can: only report warning and passive states on state changes
  can: xilinx_can: use can_change_state()
  can: xilinx_can: update stats.tx_bytes after transmission
  dt-bindings: can: xilinx_can: add Xilinx CAN FD bindings
  can: xilinx_can: refactor code in preparation for CAN FD support
  can: xilinx_can: add support for Xilinx CAN FD core

Colin Ian King (1):
  can: cc770: fix spelling mistake: "comptibility" -> "compatibility"

Dan Murphy (1):
  can: uapi: can.h: Fix can error class mask dir path

Fabio Estevam (1):
  can: flexcan: Switch to SPDX identifier

Gustavo A. R. Silva (1):
  can: peak_usb: mark expected switch fall-throughs

Jakob Unterwurzacher (1):
  can: ucan: add driver for Theobroma Systems UCAN devices

Jia-Ju Bai (2):
  can: sja1000: Replace mdelay with usleep_range in peak_pci_probe
  can: sja1000: Replace mdelay with usleep_range in pcan_add_channels

Jimmy Assarsson (13):
  can: kvaser_usb: Remove unnecessary return
  can: kvaser_usb: Remove unused commands and defines
  can: kvaser_usb: Rename message/msg to command/cmd
  can: kvaser_usb: Replace USB timeout constants with one define
  can: kvaser_usb: Add pointer to struct usb_interface into struct 
kvaser_usb
  can: kvaser_usb: Refactor kvaser_usb_get_endpoints()
  can: kvaser_usb: Refactor kvaser_usb_init_one()
  can: kvaser_usb: Improve logging messages
  can: kvaser_usb: Fix typos
  can: kvaser_usb: Add SPDX GPL-2.0 license identifier
  can: kvaser_usb: Split driver into kvaser_usb_core.c and kvaser_usb_leaf.c
  can: kvaser_usb: Add support for Kvaser USB hydra family
  can: kvaser_usb: Simplify struct kvaser_cmd_cardinfo

Luc Van Oostenryck (3):
  can: janz-ican3: fix ican3_xmit()'s return type
  can: sun4i: fix sun4ican_start_xmit()'s return type
  can: xilinx: fix xcan_start_xmit()'s return type

Marc Kleine-Budde (2):
  can: usb: Kconfig/Makefile: sort alphabetically
  can: flexcan: fix flexcan_start_xmit()'s return type

Stephane Grosjean (5):
  can: peak_canfd: improves 32-bit alignment
  can: peak_canfd: remove useless defined symbols
  can: peak_canfd: use ndev irq instead of pci_dev one
  can: peak_canfd: fix typo in error message
  can: peak_canfd: rearrange the way resources are released

YueHaibing (1):
  can: dev: use skb_put_zero to simplfy code

Zhu Yi (1):
  can: dev: enable multi-queue for SocketCAN devices

 .../devicetree/bindings/net/can/xilinx_can.txt |   35 +-
 Documentation/networking/can_ucan_protocol.rst |  332 
 Documentation/networking/index.rst |1 +
 drivers/net/can/cc770/cc770.c  |2 +-
 drivers/net/can/dev.c  |  

RE: [net-next v5 3/3] net/tls: Remove redundant array allocation.

2018-07-27 Thread Vakul Garg



> -Original Message-
> From: Dave Watson [mailto:davejwat...@fb.com]
> Sent: Thursday, July 26, 2018 2:31 AM
> To: Vakul Garg 
> Cc: David Miller ; netdev@vger.kernel.org;
> bor...@mellanox.com; avia...@mellanox.com; Doron Roberts-Kedes
> 
> Subject: Re: [net-next v5 3/3] net/tls: Remove redundant array allocation.
> 
> On 07/24/18 08:22 AM, Vakul Garg wrote:
> > > I don't think this patch is safe as-is.  sgin_arr is a stack array
> > > of size MAX_SKB_FRAGS (+ overhead), while my read of skb_cow_data is
> > > that it walks the whole chain of skbs from skb->next, and can return
> > > any number of segments.  Therefore we need to heap allocate.  I
> > > think I copied the IPSEC code here.
> > >
> > > For perf though, we could use the stack array if skb_cow_data
> > > returns <= MAX_SKB_FRAGS.
> > >
> > > This code is slightly confusing though, since we don't heap allocate
> > > in the zerocopy case - what happens is that skb_to_sgvec returns
> > > -EMSGSIZE, and we fall back to the non-zerocopy case, and return
> > > again to this function, where we then hit the skb_cow_data path and
> heap allocate.
> >
> > Thanks for explaining.
> > I am missing the point why MAX_SKB_FRAGS sized local array sgin has
> > been used in tls_sw_recvmsg(). What is special about MAX_SKB_FRAGS so
> > that we used it as a size factor for 'sgin'?
> 
> There is nothing special about it, in the zerocopy-fastpath if we happen to 
> fit
> in MAX_SKB_FRAGS we avoid any kmalloc whatsoever though.
> It could be renamed MAX_SC_FOR_FASTPATH or something.
> 
> > Will it be a bad idea to get rid of array 'sgin' on stack and simply
> > kmalloc 'sgin' for whatever the number of pages returned by
> iov_iter_npages()?
> > We can allocate for sgout too with the same kmalloc().
> >
> > (Using a local array based 'sgin' is coming in the way to achieve
> > sending multiple async record decryption requests to the accelerator
> > without waiting for previous one to complete.)
> 
> Yes we could do this, and yes we would need to heap allocate if you want to
> support multiple outstanding decryption requests.  I think async crypto
> prevents any sort of zerocopy-fastpath, however.

We already do an aead_request_alloc (which internally does kmalloc).
To mitigate the cost of kmalloc/kfree for sg lists and aad, I am allocating a 
combined memory chunk for all of these and then segmenting it into
aead_req, aad, sgin, sgout. This way there should be no extra cost for
memory allocations in non-async.
 



RE: Deadlock with restart_syscall()

2018-07-27 Thread André Pribil
> I've found this thread, where a similar issue with restart_syscall()
> has been reported:
> https://www.spinics.net/lists/netdev/msg415144.html

Found another old report about restart_syscall() producing a dead loop:
https://lists.gt.net/linux/kernel/2371438

I do not agree with the conclusion there that the user space should be 
to blame for this. I also see no ugly priority games in my scenario.

No one who wants to say anything about this?

Andre


[PATCH net-next] cxgb4: print ULD queue information managed by LLD

2018-07-27 Thread Rahul Lakkireddy
Signed-off-by: Rahul Lakkireddy 
Signed-off-by: Ganesh Goudar 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 296 +++--
 1 file changed, 277 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
index 2320f7829a6b..6f312e03432f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
@@ -2474,16 +2474,64 @@ static inline struct port_info *ethqset2pinfo(struct 
adapter *adap, int qset)
return NULL;
 }
 
+static int sge_qinfo_uld_txq_entries(const struct adapter *adap, int uld)
+{
+   const struct sge_uld_txq_info *utxq_info = adap->sge.uld_txq_info[uld];
+
+   if (!utxq_info)
+   return 0;
+
+   return DIV_ROUND_UP(utxq_info->ntxq, 4);
+}
+
+static int sge_qinfo_uld_rspq_entries(const struct adapter *adap, int uld,
+ bool ciq)
+{
+   const struct sge_uld_rxq_info *urxq_info = adap->sge.uld_rxq_info[uld];
+
+   if (!urxq_info)
+   return 0;
+
+   return ciq ? DIV_ROUND_UP(urxq_info->nciq, 4) :
+DIV_ROUND_UP(urxq_info->nrxq, 4);
+}
+
+static int sge_qinfo_uld_rxq_entries(const struct adapter *adap, int uld)
+{
+   return sge_qinfo_uld_rspq_entries(adap, uld, false);
+}
+
+static int sge_qinfo_uld_ciq_entries(const struct adapter *adap, int uld)
+{
+   return sge_qinfo_uld_rspq_entries(adap, uld, true);
+}
+
 static int sge_qinfo_show(struct seq_file *seq, void *v)
 {
+   int uld_rxq_entries[CXGB4_ULD_MAX] = { 0 };
+   int uld_ciq_entries[CXGB4_ULD_MAX] = { 0 };
+   int uld_txq_entries[CXGB4_TX_MAX] = { 0 };
+   const struct sge_uld_txq_info *utxq_info;
+   const struct sge_uld_rxq_info *urxq_info;
struct adapter *adap = seq->private;
-   int eth_entries = DIV_ROUND_UP(adap->sge.ethqsets, 4);
-   int ofld_entries = DIV_ROUND_UP(adap->sge.ofldqsets, 4);
-   int ctrl_entries = DIV_ROUND_UP(MAX_CTRL_QUEUES, 4);
-   int i, r = (uintptr_t)v - 1;
-   int ofld_idx = r - eth_entries;
-   int ctrl_idx =  ofld_idx - ofld_entries;
-   int fq_idx =  ctrl_idx - ctrl_entries;
+   int i, n, r = (uintptr_t)v - 1;
+   int eth_entries, ctrl_entries;
+   struct sge *s = >sge;
+
+   eth_entries = DIV_ROUND_UP(adap->sge.ethqsets, 4);
+   ctrl_entries = DIV_ROUND_UP(MAX_CTRL_QUEUES, 4);
+
+   mutex_lock(_mutex);
+   if (s->uld_txq_info)
+   for (i = 0; i < ARRAY_SIZE(uld_txq_entries); i++)
+   uld_txq_entries[i] = sge_qinfo_uld_txq_entries(adap, i);
+
+   if (s->uld_rxq_info) {
+   for (i = 0; i < ARRAY_SIZE(uld_rxq_entries); i++) {
+   uld_rxq_entries[i] = sge_qinfo_uld_rxq_entries(adap, i);
+   uld_ciq_entries[i] = sge_qinfo_uld_ciq_entries(adap, i);
+   }
+   }
 
if (r)
seq_putc(seq, '\n');
@@ -2505,9 +2553,10 @@ do { \
 
if (r < eth_entries) {
int base_qset = r * 4;
-   const struct sge_eth_rxq *rx = >sge.ethrxq[base_qset];
-   const struct sge_eth_txq *tx = >sge.ethtxq[base_qset];
-   int n = min(4, adap->sge.ethqsets - 4 * r);
+   const struct sge_eth_rxq *rx = >ethrxq[base_qset];
+   const struct sge_eth_txq *tx = >ethtxq[base_qset];
+
+   n = min(4, s->ethqsets - 4 * r);
 
S("QType:", "Ethernet");
S("Interface:",
@@ -2532,8 +2581,7 @@ do { \
R("RspQ CIDX:", rspq.cidx);
R("RspQ Gen:", rspq.gen);
S3("u", "Intr delay:", qtimer_val(adap, [i].rspq));
-   S3("u", "Intr pktcnt:",
-  adap->sge.counter_val[rx[i].rspq.pktcnt_idx]);
+   S3("u", "Intr pktcnt:", s->counter_val[rx[i].rspq.pktcnt_idx]);
R("FL ID:", fl.cntxt_id);
R("FL size:", fl.size - 8);
R("FL pend:", fl.pend_cred);
@@ -2558,9 +2606,196 @@ do { \
RL("FLLow:", fl.low);
RL("FLStarving:", fl.starving);
 
-   } else if (ctrl_idx < ctrl_entries) {
-   const struct sge_ctrl_txq *tx = >sge.ctrlq[ctrl_idx * 4];
-   int n = min(4, adap->params.nports - 4 * ctrl_idx);
+   goto unlock;
+   }
+
+   r -= eth_entries;
+   if (r < uld_txq_entries[CXGB4_TX_OFLD]) {
+   const struct sge_uld_txq *tx;
+
+   utxq_info = s->uld_txq_info[CXGB4_TX_OFLD];
+   tx = _info->uldtxq[r * 4];
+   n = min(4, utxq_info->ntxq - 4 * r);
+
+   S("QType:", "OFLD-TXQ");
+   T("TxQ ID:", q.cntxt_id);
+   T("TxQ size:", q.size);
+   T("TxQ inuse:", q.in_use);
+   T("TxQ CIDX:", q.cidx);
+   T("TxQ PIDX:", q.pidx);
+
+   goto 

[PATCH net-next 4/4] l2tp: drop ->mru from struct l2tp_session

2018-07-27 Thread Guillaume Nault
This field is not used.

Treat PPPIOC*MRU the same way as PPPIOC*FLAGS: "get" requests return 0,
while "set" requests validate the user-supplied pointer but discard its
value.

Signed-off-by: Guillaume Nault 
---
 include/uapi/linux/l2tp.h |  2 +-
 net/l2tp/l2tp_core.c  |  1 -
 net/l2tp/l2tp_core.h  |  2 --
 net/l2tp/l2tp_debugfs.c   |  4 ++--
 net/l2tp/l2tp_netlink.c   | 10 +-
 net/l2tp/l2tp_ppp.c   | 41 +--
 6 files changed, 9 insertions(+), 51 deletions(-)

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index 41bf79a4b165..8bb8c7cfabe5 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -120,7 +120,7 @@ enum {
L2TP_ATTR_UDP_SPORT,/* u16 */
L2TP_ATTR_UDP_DPORT,/* u16 */
L2TP_ATTR_MTU,  /* u16 */
-   L2TP_ATTR_MRU,  /* u16 */
+   L2TP_ATTR_MRU,  /* u16 (not used) */
L2TP_ATTR_STATS,/* nested */
L2TP_ATTR_IP6_SADDR,/* struct in6_addr */
L2TP_ATTR_IP6_DADDR,/* struct in6_addr */
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index d10f4ed52d92..c61a467fd9b8 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1675,7 +1675,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, 
struct l2tp_tunnel *tunn
session->pwtype = cfg->pw_type;
session->debug = cfg->debug;
session->mtu = cfg->mtu;
-   session->mru = cfg->mru;
session->send_seq = cfg->send_seq;
session->recv_seq = cfg->recv_seq;
session->lns_mode = cfg->lns_mode;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 49fd5e05538c..fa5ae9432d38 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -62,7 +62,6 @@ struct l2tp_session_cfg {
int reorder_timeout; /* configured reorder timeout
  * (in jiffies) */
int mtu;
-   int mru;
char*ifname;
 };
 
@@ -107,7 +106,6 @@ struct l2tp_session {
  * (in jiffies) */
int reorder_skip;   /* set if skip to next nr */
int mtu;
-   int mru;
enum l2tp_pwtypepwtype;
struct l2tp_stats   stats;
struct hlist_node   global_hlist;   /* Global hash list node */
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 91b9248610f0..aee271741f5b 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -191,8 +191,8 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, 
void *v)
if (session->send_seq || session->recv_seq)
seq_printf(m, "   nr %hu, ns %hu\n", session->nr, session->ns);
seq_printf(m, "   refcnt %d\n", refcount_read(>ref_count));
-   seq_printf(m, "   config %d/%d/%c/%c/-/%s %08x %u\n",
-  session->mtu, session->mru,
+   seq_printf(m, "   config %d/0/%c/%c/-/%s %08x %u\n",
+  session->mtu,
   session->recv_seq ? 'R' : '-',
   session->send_seq ? 'S' : '-',
   session->lns_mode ? "LNS" : "LAC",
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 8ea1deefbc37..a7c409215336 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -611,9 +611,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, 
struct genl_info *inf
if (info->attrs[L2TP_ATTR_MTU])
cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
 
-   if (info->attrs[L2TP_ATTR_MRU])
-   cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
-
 #ifdef CONFIG_MODULES
if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) {
genl_unlock();
@@ -704,9 +701,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, 
struct genl_info *inf
if (info->attrs[L2TP_ATTR_MTU])
session->mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
 
-   if (info->attrs[L2TP_ATTR_MRU])
-   session->mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
-
ret = l2tp_session_notify(_nl_family, info,
  session, L2TP_CMD_SESSION_MODIFY);
 
@@ -737,9 +731,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 
portid, u32 seq, int fl
session->peer_session_id) ||
nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) ||
nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype) ||
-   nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu) ||
-   (session->mru &&
-nla_put_u16(skb, L2TP_ATTR_MRU, session->mru)))
+   nla_put_u16(skb, 

[PATCH net-next 0/4] l2tp: remove unused session fields

2018-07-27 Thread Guillaume Nault
Several fields of the session structures can be set, but remain unused
otherwise.
This series removes these fields and explicitly ignores the associated
ioctls and netlink attributes.

Guillaume Nault (4):
  l2tp: ignore L2TP_ATTR_DATA_SEQ netlink attribute
  l2tp: ignore L2TP_ATTR_VLAN_ID netlink attribute
  l2tp: drop ->flags from struct pppol2tp_session
  l2tp: drop ->mru from struct l2tp_session

 include/uapi/linux/l2tp.h | 13 +-
 net/l2tp/l2tp_core.c  |  1 -
 net/l2tp/l2tp_core.h  | 11 
 net/l2tp/l2tp_debugfs.c   |  6 ++---
 net/l2tp/l2tp_netlink.c   | 19 +-
 net/l2tp/l2tp_ppp.c   | 54 +--
 6 files changed, 17 insertions(+), 87 deletions(-)

-- 
2.18.0



[PATCH net-next 3/4] l2tp: drop ->flags from struct pppol2tp_session

2018-07-27 Thread Guillaume Nault
This field is not used.

Keep validating user input in PPPIOCSFLAGS. Even though we discard the
value, it would look wrong to succeed if an invalid address was passed
from userspace.

Signed-off-by: Guillaume Nault 
---
 net/l2tp/l2tp_ppp.c | 13 ++---
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 000c9829304c..759ce8421269 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -127,8 +127,6 @@ struct pppol2tp_session {
 * PPPoX socket */
struct sock *__sk;  /* Copy of .sk, for cleanup */
struct rcu_head rcu;/* For asynchronous release */
-   int flags;  /* accessed by PPPIOCGFLAGS.
-* Unused. */
 };
 
 static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb);
@@ -1057,7 +1055,6 @@ static int pppol2tp_session_ioctl(struct l2tp_session 
*session,
int err = 0;
struct sock *sk;
int val = (int) arg;
-   struct pppol2tp_session *ps = l2tp_session_priv(session);
struct l2tp_tunnel *tunnel = session->tunnel;
struct pppol2tp_ioc_stats stats;
 
@@ -1134,21 +1131,15 @@ static int pppol2tp_session_ioctl(struct l2tp_session 
*session,
 
case PPPIOCGFLAGS:
err = -EFAULT;
-   if (put_user(ps->flags, (int __user *) arg))
+   if (put_user(0, (int __user *)arg))
break;
-
-   l2tp_info(session, L2TP_MSG_CONTROL, "%s: get flags=%d\n",
- session->name, ps->flags);
err = 0;
break;
 
case PPPIOCSFLAGS:
err = -EFAULT;
-   if (get_user(val, (int __user *) arg))
+   if (get_user(val, (int __user *)arg))
break;
-   ps->flags = val;
-   l2tp_info(session, L2TP_MSG_CONTROL, "%s: set flags=%d\n",
- session->name, ps->flags);
err = 0;
break;
 
-- 
2.18.0



[PATCH net-next 1/4] l2tp: ignore L2TP_ATTR_DATA_SEQ netlink attribute

2018-07-27 Thread Guillaume Nault
The value of this attribute is never used.

Signed-off-by: Guillaume Nault 
---
 include/uapi/linux/l2tp.h | 7 ---
 net/l2tp/l2tp_core.h  | 8 
 net/l2tp/l2tp_debugfs.c   | 4 +---
 net/l2tp/l2tp_netlink.c   | 6 --
 4 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index 7d570c7bd117..ae888606b3ec 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -65,9 +65,9 @@ struct sockaddr_l2tpip6 {
  * TUNNEL_MODIFY   - CONN_ID, udpcsum
  * TUNNEL_GETSTATS - CONN_ID, (stats)
  * TUNNEL_GET  - CONN_ID, (...)
- * SESSION_CREATE  - SESSION_ID, PW_TYPE, data_seq, cookie, peer_cookie, 
l2spec
+ * SESSION_CREATE  - SESSION_ID, PW_TYPE, cookie, peer_cookie, l2spec
  * SESSION_DELETE  - SESSION_ID
- * SESSION_MODIFY  - SESSION_ID, data_seq
+ * SESSION_MODIFY  - SESSION_ID
  * SESSION_GET - SESSION_ID, (...)
  * SESSION_GETSTATS- SESSION_ID, (stats)
  *
@@ -95,7 +95,7 @@ enum {
L2TP_ATTR_PW_TYPE,  /* u16, enum l2tp_pwtype */
L2TP_ATTR_ENCAP_TYPE,   /* u16, enum l2tp_encap_type */
L2TP_ATTR_OFFSET,   /* u16 (not used) */
-   L2TP_ATTR_DATA_SEQ, /* u16 */
+   L2TP_ATTR_DATA_SEQ, /* u16 (not used) */
L2TP_ATTR_L2SPEC_TYPE,  /* u8, enum l2tp_l2spec_type */
L2TP_ATTR_L2SPEC_LEN,   /* u8 (not used) */
L2TP_ATTR_PROTO_VERSION,/* u8 */
@@ -169,6 +169,7 @@ enum l2tp_encap_type {
L2TP_ENCAPTYPE_IP,
 };
 
+/* For L2TP_ATTR_DATA_SEQ. Unused. */
 enum l2tp_seqmode {
L2TP_SEQ_NONE = 0,
L2TP_SEQ_IP = 1,
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index d85fde793a8c..7dbfb55ab3b5 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -45,10 +45,6 @@ struct l2tp_tunnel;
  */
 struct l2tp_session_cfg {
enum l2tp_pwtypepw_type;
-   unsigned intdata_seq:2; /* data sequencing level
-* 0 => none, 1 => IP only,
-* 2 => all
-*/
unsigned intrecv_seq:1; /* expect receive packets with
 * sequence numbers? */
unsigned intsend_seq:1; /* send packets with sequence
@@ -99,10 +95,6 @@ struct l2tp_session {
 
charname[32];   /* for logging */
charifname[IFNAMSIZ];
-   unsigned intdata_seq:2; /* data sequencing level
-* 0 => none, 1 => IP only,
-* 2 => all
-*/
unsigned intrecv_seq:1; /* expect receive packets with
 * sequence numbers? */
unsigned intsend_seq:1; /* send packets with sequence
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index b5d7dde003ef..91b9248610f0 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -191,12 +191,10 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, 
void *v)
if (session->send_seq || session->recv_seq)
seq_printf(m, "   nr %hu, ns %hu\n", session->nr, session->ns);
seq_printf(m, "   refcnt %d\n", refcount_read(>ref_count));
-   seq_printf(m, "   config %d/%d/%c/%c/%s/%s %08x %u\n",
+   seq_printf(m, "   config %d/%d/%c/%c/-/%s %08x %u\n",
   session->mtu, session->mru,
   session->recv_seq ? 'R' : '-',
   session->send_seq ? 'S' : '-',
-  session->data_seq == 1 ? "IPSEQ" :
-  session->data_seq == 2 ? "DATASEQ" : "-",
   session->lns_mode ? "LNS" : "LAC",
   session->debug,
   jiffies_to_msecs(session->reorder_timeout));
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 5b9900889e31..e4785f6966f6 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -560,9 +560,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, 
struct genl_info *inf
}
 
if (tunnel->version > 2) {
-   if (info->attrs[L2TP_ATTR_DATA_SEQ])
-   cfg.data_seq = 
nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
-
if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) {
cfg.l2specific_type = 
nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]);
if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT &&
@@ -693,9 +690,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, 
struct genl_info *inf
if (info->attrs[L2TP_ATTR_DEBUG])
session->debug = 

[PATCH net-next 2/4] l2tp: ignore L2TP_ATTR_VLAN_ID netlink attribute

2018-07-27 Thread Guillaume Nault
The value of this attribute is never used.

Signed-off-by: Guillaume Nault 
---
 include/uapi/linux/l2tp.h | 4 ++--
 net/l2tp/l2tp_core.h  | 1 -
 net/l2tp/l2tp_netlink.c   | 3 ---
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index ae888606b3ec..41bf79a4b165 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -60,7 +60,7 @@ struct sockaddr_l2tpip6 {
 /*
  * Commands.
  * Valid TLVs of each command are:-
- * TUNNEL_CREATE   - CONN_ID, pw_type, netns, ifname, ipinfo, udpinfo, 
udpcsum, vlanid
+ * TUNNEL_CREATE   - CONN_ID, pw_type, netns, ifname, ipinfo, udpinfo, 
udpcsum
  * TUNNEL_DELETE   - CONN_ID
  * TUNNEL_MODIFY   - CONN_ID, udpcsum
  * TUNNEL_GETSTATS - CONN_ID, (stats)
@@ -105,7 +105,7 @@ enum {
L2TP_ATTR_SESSION_ID,   /* u32 */
L2TP_ATTR_PEER_SESSION_ID,  /* u32 */
L2TP_ATTR_UDP_CSUM, /* u8 */
-   L2TP_ATTR_VLAN_ID,  /* u16 */
+   L2TP_ATTR_VLAN_ID,  /* u16 (not used) */
L2TP_ATTR_COOKIE,   /* 0, 4 or 8 bytes */
L2TP_ATTR_PEER_COOKIE,  /* 0, 4 or 8 bytes */
L2TP_ATTR_DEBUG,/* u32, enum l2tp_debug_flags */
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 7dbfb55ab3b5..49fd5e05538c 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -54,7 +54,6 @@ struct l2tp_session_cfg {
 * control of LNS. */
int debug;  /* bitmask of debug message
 * categories */
-   u16 vlan_id;/* VLAN pseudowire only */
u16 l2specific_type; /* Layer 2 specific type */
u8  cookie[8];  /* optional cookie */
int cookie_len; /* 0, 4 or 8 bytes */
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index e4785f6966f6..8ea1deefbc37 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -591,9 +591,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, 
struct genl_info *inf
}
if (info->attrs[L2TP_ATTR_IFNAME])
cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
-
-   if (info->attrs[L2TP_ATTR_VLAN_ID])
-   cfg.vlan_id = 
nla_get_u16(info->attrs[L2TP_ATTR_VLAN_ID]);
}
 
if (info->attrs[L2TP_ATTR_DEBUG])
-- 
2.18.0



[PATCHv4 net-next 1/2] route: add support for directed broadcast forwarding

2018-07-27 Thread Xin Long
This patch implements the feature described in rfc1812#section-5.3.5.2
and rfc2644. It allows the router to forward directed broadcast when
sysctl bc_forwarding is enabled.

Note that this feature could be done by iptables -j TEE, but it would
cause some problems:
  - target TEE's gateway param has to be set to a specific address,
and it's not flexible, especially when the router wants to forward all
directed broadcasts.
  - this duplicates the directed broadcasts so this may cause side
effects to applications.

Besides, to stay consistent with routers on other OSes such as BSD, it's
also necessary to implement it in the route rx path.

Note that route cache needs to be flushed when bc_forwarding is
changed.

Signed-off-by: Xin Long 
---
 include/linux/inetdevice.h   |  1 +
 include/uapi/linux/ip.h  |  1 +
 include/uapi/linux/netconf.h |  1 +
 net/ipv4/devinet.c   | 11 +++
 net/ipv4/route.c |  6 +-
 5 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 27650f1..c759d1c 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device 
*in_dev)
 
 #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING)
 #define IN_DEV_MFORWARD(in_dev)IN_DEV_ANDCONF((in_dev), 
MC_FORWARDING)
+#define IN_DEV_BFORWARD(in_dev)IN_DEV_ANDCONF((in_dev), 
BC_FORWARDING)
 #define IN_DEV_RPFILTER(in_dev)IN_DEV_MAXCONF((in_dev), 
RP_FILTER)
 #define IN_DEV_SRC_VMARK(in_dev)   IN_DEV_ORCONF((in_dev), SRC_VMARK)
 #define IN_DEV_SOURCE_ROUTE(in_dev)IN_DEV_ANDCONF((in_dev), \
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index b24a742..e42d13b 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -168,6 +168,7 @@ enum
IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
IPV4_DEVCONF_DROP_GRATUITOUS_ARP,
+   IPV4_DEVCONF_BC_FORWARDING,
__IPV4_DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index c84fcdf..fac4edd 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -18,6 +18,7 @@ enum {
NETCONFA_PROXY_NEIGH,
NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
NETCONFA_INPUT,
+   NETCONFA_BC_FORWARDING,
__NETCONFA_MAX
 };
 #define NETCONFA_MAX   (__NETCONFA_MAX - 1)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab..ea4bd8a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (all || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
+   if (all || type == NETCONFA_BC_FORWARDING)
+   size += nla_total_size(4);
if (all || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff 
*skb, int ifindex,
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
+   if ((all || type == NETCONFA_BC_FORWARDING) &&
+   nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+   IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+   goto nla_put_failure;
if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int 
write,
if ((new_value == 0) && (old_value != 0))
rt_cache_flush(net);
 
+   if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
+   new_value != old_value)
+   rt_cache_flush(net);
+
if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
new_value != old_value) {
ifindex = devinet_conf_ifindex(net, cnf);
@@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
 devinet_sysctl_forward),
DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+   DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
 
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97..b678466 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, 
__be32 

[PATCHv4 net-next 2/2] selftests: add a selftest for directed broadcast forwarding

2018-07-27 Thread Xin Long
As Ido suggested, this patch adds a selftest for directed
broadcast forwarding with vrf. It does the assertion by checking
the src IP of the echo-reply packet in ping_test_from.

Signed-off-by: Xin Long 
---
 .../selftests/net/forwarding/router_broadcast.sh   | 233 +
 1 file changed, 233 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/router_broadcast.sh

diff --git a/tools/testing/selftests/net/forwarding/router_broadcast.sh 
b/tools/testing/selftests/net/forwarding/router_broadcast.sh
new file mode 100755
index 000..7bd2ebb
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_broadcast.sh
@@ -0,0 +1,233 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="ping_ipv4"
+NUM_NETIFS=6
+source lib.sh
+
+h1_create()
+{
+   vrf_create "vrf-h1"
+   ip link set dev $h1 master vrf-h1
+
+   ip link set dev vrf-h1 up
+   ip link set dev $h1 up
+
+   ip address add 192.0.2.2/24 dev $h1
+
+   ip route add 198.51.100.0/24 vrf vrf-h1 nexthop via 192.0.2.1
+   ip route add 198.51.200.0/24 vrf vrf-h1 nexthop via 192.0.2.1
+}
+
+h1_destroy()
+{
+   ip route del 198.51.200.0/24 vrf vrf-h1
+   ip route del 198.51.100.0/24 vrf vrf-h1
+
+   ip address del 192.0.2.2/24 dev $h1
+
+   ip link set dev $h1 down
+   vrf_destroy "vrf-h1"
+}
+
+h2_create()
+{
+   vrf_create "vrf-h2"
+   ip link set dev $h2 master vrf-h2
+
+   ip link set dev vrf-h2 up
+   ip link set dev $h2 up
+
+   ip address add 198.51.100.2/24 dev $h2
+
+   ip route add 192.0.2.0/24 vrf vrf-h2 nexthop via 198.51.100.1
+   ip route add 198.51.200.0/24 vrf vrf-h2 nexthop via 198.51.100.1
+}
+
+h2_destroy()
+{
+   ip route del 198.51.200.0/24 vrf vrf-h2
+   ip route del 192.0.2.0/24 vrf vrf-h2
+
+   ip address del 198.51.100.2/24 dev $h2
+
+   ip link set dev $h2 down
+   vrf_destroy "vrf-h2"
+}
+
+h3_create()
+{
+   vrf_create "vrf-h3"
+   ip link set dev $h3 master vrf-h3
+
+   ip link set dev vrf-h3 up
+   ip link set dev $h3 up
+
+   ip address add 198.51.200.2/24 dev $h3
+
+   ip route add 192.0.2.0/24 vrf vrf-h3 nexthop via 198.51.200.1
+   ip route add 198.51.100.0/24 vrf vrf-h3 nexthop via 198.51.200.1
+}
+
+h3_destroy()
+{
+   ip route del 198.51.100.0/24 vrf vrf-h3
+   ip route del 192.0.2.0/24 vrf vrf-h3
+
+   ip address del 198.51.200.2/24 dev $h3
+
+   ip link set dev $h3 down
+   vrf_destroy "vrf-h3"
+}
+
+router_create()
+{
+   ip link set dev $rp1 up
+   ip link set dev $rp2 up
+   ip link set dev $rp3 up
+
+   ip address add 192.0.2.1/24 dev $rp1
+
+   ip address add 198.51.100.1/24 dev $rp2
+   ip address add 198.51.200.1/24 dev $rp3
+}
+
+router_destroy()
+{
+   ip address del 198.51.200.1/24 dev $rp3
+   ip address del 198.51.100.1/24 dev $rp2
+
+   ip address del 192.0.2.1/24 dev $rp1
+
+   ip link set dev $rp3 down
+   ip link set dev $rp2 down
+   ip link set dev $rp1 down
+}
+
+setup_prepare()
+{
+   h1=${NETIFS[p1]}
+   rp1=${NETIFS[p2]}
+
+   rp2=${NETIFS[p3]}
+   h2=${NETIFS[p4]}
+
+   rp3=${NETIFS[p5]}
+   h3=${NETIFS[p6]}
+
+   vrf_prepare
+
+   h1_create
+   h2_create
+   h3_create
+
+   router_create
+
+   forwarding_enable
+}
+
+cleanup()
+{
+   pre_cleanup
+
+   forwarding_restore
+
+   router_destroy
+
+   h3_destroy
+   h2_destroy
+   h1_destroy
+
+   vrf_cleanup
+}
+
+bc_forwarding_disable()
+{
+   sysctl_set net.ipv4.conf.all.bc_forwarding 0
+   sysctl_set net.ipv4.conf.$rp1.bc_forwarding 0
+}
+
+bc_forwarding_enable()
+{
+   sysctl_set net.ipv4.conf.all.bc_forwarding 1
+   sysctl_set net.ipv4.conf.$rp1.bc_forwarding 1
+}
+
+bc_forwarding_restore()
+{
+   sysctl_restore net.ipv4.conf.$rp1.bc_forwarding
+   sysctl_restore net.ipv4.conf.all.bc_forwarding
+}
+
+ping_test_from()
+{
+   local oif=$1
+   local dip=$2
+   local from=$3
+   local fail=${4:-0}
+
+   RET=0
+
+   log_info "ping $dip, expected reply from $from"
+   ip vrf exec $(master_name_get $oif) \
+   $PING -I $oif $dip -c 10 -i 0.1 -w 2 -b 2>&1 | grep $from &> /dev/null
+   check_err_fail $fail $?
+}
+
+ping_ipv4()
+{
+   sysctl_set net.ipv4.icmp_echo_ignore_broadcasts 0
+
+   bc_forwarding_disable
+   log_info "bc_forwarding disabled on r1 =>"
+   ping_test_from $h1 198.51.100.255 192.0.2.1
+   log_test "h1 -> net2: reply from r1 (not forwarding)"
+   ping_test_from $h1 198.51.200.255 192.0.2.1
+   log_test "h1 -> net3: reply from r1 (not forwarding)"
+   ping_test_from $h1 192.0.2.255 192.0.2.1
+   log_test "h1 -> net1: reply from r1 (not dropping)"
+   ping_test_from $h1 255.255.255.255 192.0.2.1
+   log_test "h1 -> 255.255.255.255: reply from r1 (not forwarding)"
+
+   ping_test_from $h2 192.0.2.255 198.51.100.1
+   

[PATCHv4 net-next 0/2] route: add support and selftests for directed broadcast forwarding

2018-07-27 Thread Xin Long
Patch 1/2 is the feature and 2/2 is the selftest. Check the changelog
on each of them to know the details.

v1->v2:
  - fix a typo in changelog.
  - fix an uapi break that Davide noticed.
  - flush route cache when bc_forwarding is changed.
  - add the selftest for this patch as Ido's suggestion.

v2->v3:
  - fix an incorrect 'if check' in devinet_conf_proc as David Ahern
noticed.
  - extend the selftest after one David Ahern fix for vrf.

v3->v4:
  - improve the output log in the selftest as David Ahern suggested.

Xin Long (2):
  route: add support for directed broadcast forwarding
  selftests: add a selftest for directed broadcast forwarding

 include/linux/inetdevice.h |   1 +
 include/uapi/linux/ip.h|   1 +
 include/uapi/linux/netconf.h   |   1 +
 net/ipv4/devinet.c |  11 +
 net/ipv4/route.c   |   6 +-
 .../selftests/net/forwarding/router_broadcast.sh   | 233 +
 6 files changed, 252 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/net/forwarding/router_broadcast.sh

-- 
2.1.0



[PATCH][net-next] openvswitch: eliminate cpu_used_mask from sw_flow

2018-07-27 Thread Li RongQing
The size of struct cpumask varies with CONFIG_NR_CPUS. In some configs
CONFIG_NR_CPUS is very large, e.g. 5120, and struct cpumask then takes
640 bytes; if there are thousands of flows, this consumes a lot of
memory.

cpu_used_mask has two purposes:
1: It assumes the first cpu is cpu0, which may not be true; now use
   cpumask_first(cpu_possible_mask) instead.
2: It reduces the iteration when getting/clearing statistics; but that
   is not a hot path, so use for_each_possible_cpu.

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/openvswitch/flow.c   | 11 +--
 net/openvswitch/flow.h   |  5 ++---
 net/openvswitch/flow_table.c | 11 +--
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 56b8e7167790..ad580bec00fb 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -85,7 +85,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 
tcp_flags,
if (cpu == 0 && unlikely(flow->stats_last_writer != cpu))
flow->stats_last_writer = cpu;
} else {
-   stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
+   int cpu1 = cpumask_first(cpu_possible_mask);
+
+   stats = rcu_dereference(flow->stats[cpu1]); /* Pre-allocated. */
spin_lock(>lock);
 
/* If the current CPU is the only writer on the
@@ -118,7 +120,6 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 
tcp_flags,
 
rcu_assign_pointer(flow->stats[cpu],
   new_stats);
-   cpumask_set_cpu(cpu, 
>cpu_used_mask);
goto unlock;
}
}
@@ -145,8 +146,7 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
*tcp_flags = 0;
memset(ovs_stats, 0, sizeof(*ovs_stats));
 
-   /* We open code this to make sure cpu 0 is always considered */
-   for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, 
>cpu_used_mask)) {
+   for_each_possible_cpu(cpu) {
struct flow_stats *stats = 
rcu_dereference_ovsl(flow->stats[cpu]);
 
if (stats) {
@@ -169,8 +169,7 @@ void ovs_flow_stats_clear(struct sw_flow *flow)
 {
int cpu;
 
-   /* We open code this to make sure cpu 0 is always considered */
-   for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, 
>cpu_used_mask)) {
+   for_each_possible_cpu(cpu) {
struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
 
if (stats) {
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index c670dd24b8b7..d0ea5d6ced3e 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -223,17 +223,16 @@ struct sw_flow {
u32 hash;
} flow_table, ufid_table;
int stats_last_writer;  /* CPU id of the last writer on
-* 'stats[0]'.
+* 'stats[first cpu id]'.
 */
struct sw_flow_key key;
struct sw_flow_id id;
-   struct cpumask cpu_used_mask;
struct sw_flow_mask *mask;
struct sw_flow_actions __rcu *sf_acts;
struct flow_stats __rcu *stats[]; /* One for each CPU.  First one
   * is allocated at flow creation time,
   * the rest are allocated on demand
-  * while holding the 'stats[0].lock'.
+  * while holding the 'stats[first cpu 
id].lock'
   */
 };
 
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 80ea2a71852e..e4dbd65c308a 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -80,6 +80,7 @@ struct sw_flow *ovs_flow_alloc(void)
 {
struct sw_flow *flow;
struct flow_stats *stats;
+   int cpu = cpumask_first(cpu_possible_mask);
 
flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
if (!flow)
@@ -90,15 +91,13 @@ struct sw_flow *ovs_flow_alloc(void)
/* Initialize the default stat node. */
stats = kmem_cache_alloc_node(flow_stats_cache,
  GFP_KERNEL | __GFP_ZERO,
- node_online(0) ? 0 : NUMA_NO_NODE);
+ cpu_to_node(cpu));
if (!stats)
goto err;
 
spin_lock_init(>lock);
 
-   RCU_INIT_POINTER(flow->stats[0], stats);
-
-   cpumask_set_cpu(0, >cpu_used_mask);
+   RCU_INIT_POINTER(flow->stats[cpu], stats);
 
return flow;
 err:
@@ -142,11 +141,11 @@ static void flow_free(struct sw_flow *flow)
kfree(flow->id.unmasked_key);
if (flow->sf_acts)
  

[patch net-next] net: sched: don't dump chains only held by actions

2018-07-27 Thread Jiri Pirko
From: Jiri Pirko 

In case a chain is empty and not explicitly created by a user,
such chain should not exist. The only exception is if there is
an action "goto chain" pointing to it. In that case, don't show the
chain in the dump. Track the chain references held by actions and
use them to find out if a chain should or should not be shown
in chain dump.

Signed-off-by: Jiri Pirko 
---
rfc->v1:
- fixed get/del paths to also ignore zombie chains
---
 include/net/pkt_cls.h |  3 ++
 include/net/sch_generic.h |  1 +
 net/sched/act_api.c   |  4 +--
 net/sched/cls_api.c   | 70 +++
 4 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index a3101582f642..6d02f31abba8 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -39,7 +39,10 @@ bool tcf_queue_work(struct rcu_work *rwork, work_func_t 
func);
 #ifdef CONFIG_NET_CLS
 struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
bool create);
+struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block,
+  u32 chain_index);
 void tcf_chain_put(struct tcf_chain *chain);
+void tcf_chain_put_by_act(struct tcf_chain *chain);
 void tcf_block_netif_keep_dst(struct tcf_block *block);
 int tcf_block_get(struct tcf_block **p_block,
  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 085c509c8674..c5432362dc26 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -314,6 +314,7 @@ struct tcf_chain {
struct tcf_block *block;
u32 index; /* chain index */
unsigned int refcnt;
+   unsigned int action_refcnt;
bool explicitly_created;
const struct tcf_proto_ops *tmplt_ops;
void *tmplt_priv;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 148a89ab789b..b43df1e25c6d 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -36,7 +36,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, 
struct tcf_proto *tp)
 
if (!tp)
return -EINVAL;
-   a->goto_chain = tcf_chain_get(tp->chain->block, chain_index, true);
+   a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index);
if (!a->goto_chain)
return -ENOMEM;
return 0;
@@ -44,7 +44,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, 
struct tcf_proto *tp)
 
 static void tcf_action_goto_chain_fini(struct tc_action *a)
 {
-   tcf_chain_put(a->goto_chain);
+   tcf_chain_put_by_act(a->goto_chain);
 }
 
 static void tcf_action_goto_chain_exec(const struct tc_action *a,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 75cce2819de9..e20aad1987b8 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -262,6 +262,25 @@ static void tcf_chain_hold(struct tcf_chain *chain)
++chain->refcnt;
 }
 
+static void tcf_chain_hold_by_act(struct tcf_chain *chain)
+{
+   ++chain->action_refcnt;
+}
+
+static void tcf_chain_release_by_act(struct tcf_chain *chain)
+{
+   --chain->action_refcnt;
+}
+
+static bool tcf_chain_is_zombie(struct tcf_chain *chain)
+{
+   /* In case all the references are action references, this
+* chain is a zombie and should not be listed in the chain
+* dump list.
+*/
+   return chain->refcnt == chain->action_refcnt;
+}
+
 static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block,
  u32 chain_index)
 {
@@ -298,6 +317,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, 
u32 chain_index,
 }
 EXPORT_SYMBOL(tcf_chain_get);
 
+struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 
chain_index)
+{
+   struct tcf_chain *chain = tcf_chain_get(block, chain_index, true);
+
+   tcf_chain_hold_by_act(chain);
+   return chain;
+}
+EXPORT_SYMBOL(tcf_chain_get_by_act);
+
 static void tc_chain_tmplt_del(struct tcf_chain *chain);
 
 void tcf_chain_put(struct tcf_chain *chain)
@@ -310,6 +338,13 @@ void tcf_chain_put(struct tcf_chain *chain)
 }
 EXPORT_SYMBOL(tcf_chain_put);
 
+void tcf_chain_put_by_act(struct tcf_chain *chain)
+{
+   tcf_chain_release_by_act(chain);
+   tcf_chain_put(chain);
+}
+EXPORT_SYMBOL(tcf_chain_put_by_act);
+
 static void tcf_chain_put_explicitly_created(struct tcf_chain *chain)
 {
if (chain->explicitly_created)
@@ -1803,20 +1838,29 @@ static int tc_ctl_chain(struct sk_buff *skb, struct 
nlmsghdr *n,
chain = tcf_chain_lookup(block, chain_index);
if (n->nlmsg_type == RTM_NEWCHAIN) {
if (chain) {
-   NL_SET_ERR_MSG(extack, "Filter chain already exists");
-   return -EEXIST;
-   }
-   if (!(n->nlmsg_flags & NLM_F_CREATE)) {
-   NL_SET_ERR_MSG(extack, "Need both 

Re: [PATCH RFC ipsec-next] xfrm: Check Reverse-Mark Lookup Before ADDSA/DELSA

2018-07-27 Thread Steffen Klassert
On Wed, Jul 25, 2018 at 03:36:47PM -0700, Nathan Harold wrote:
> 
> diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
> index b669262682c9..ee212a7c91a9 100644
> --- a/net/xfrm/xfrm_state.c
> +++ b/net/xfrm/xfrm_state.c
> @@ -815,10 +815,10 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct 
> flowi *fl,
>   afinfo->init_temprop(x, tmpl, daddr, saddr);
>  }
>  
> -static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
> -   const xfrm_address_t *daddr,
> -   __be32 spi, u8 proto,
> -   unsigned short family)
> +static struct xfrm_state *
> +__xfrm_state_lookup(struct net *net, u32 mark, u32 mask,
> + const xfrm_address_t *daddr,
> + __be32 spi, u8 proto, unsigned short family)

The argument list of these functions is getting longer and longer.
Can't you just put in a pointer to struct xfrm_mark and dereference
inside the function?

Looks good otherwise.


[PATCH 07/14] xfrm: use time64_t for in-kernel timestamps

2018-07-27 Thread Steffen Klassert
From: Arnd Bergmann 

The lifetime management uses '__u64' timestamps on the user space
interface, but 'unsigned long' for reading the current time in the kernel
with get_seconds().

While this is probably safe beyond y2038, it will still overflow in 2106,
and the get_seconds() call is deprecated because of that.

This changes the xfrm time handling to use time64_t consistently, along
with reading the time using the safer ktime_get_real_seconds(). It still
suffers from problems that can happen from a concurrent settimeofday()
call or (to a lesser degree) a leap second update, but since the time
stamps are part of the user API, there is nothing we can do to prevent
that.

Signed-off-by: Arnd Bergmann 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_policy.c | 24 
 net/xfrm/xfrm_state.c  | 10 +-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ef75891450e7..5d2f734f4309 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -189,8 +189,8 @@ static inline unsigned long make_jiffies(long secs)
 static void xfrm_policy_timer(struct timer_list *t)
 {
struct xfrm_policy *xp = from_timer(xp, t, timer);
-   unsigned long now = get_seconds();
-   long next = LONG_MAX;
+   time64_t now = ktime_get_real_seconds();
+   time64_t next = TIME64_MAX;
int warn = 0;
int dir;
 
@@ -202,7 +202,7 @@ static void xfrm_policy_timer(struct timer_list *t)
dir = xfrm_policy_id2dir(xp->index);
 
if (xp->lft.hard_add_expires_seconds) {
-   long tmo = xp->lft.hard_add_expires_seconds +
+   time64_t tmo = xp->lft.hard_add_expires_seconds +
xp->curlft.add_time - now;
if (tmo <= 0)
goto expired;
@@ -210,7 +210,7 @@ static void xfrm_policy_timer(struct timer_list *t)
next = tmo;
}
if (xp->lft.hard_use_expires_seconds) {
-   long tmo = xp->lft.hard_use_expires_seconds +
+   time64_t tmo = xp->lft.hard_use_expires_seconds +
(xp->curlft.use_time ? : xp->curlft.add_time) - now;
if (tmo <= 0)
goto expired;
@@ -218,7 +218,7 @@ static void xfrm_policy_timer(struct timer_list *t)
next = tmo;
}
if (xp->lft.soft_add_expires_seconds) {
-   long tmo = xp->lft.soft_add_expires_seconds +
+   time64_t tmo = xp->lft.soft_add_expires_seconds +
xp->curlft.add_time - now;
if (tmo <= 0) {
warn = 1;
@@ -228,7 +228,7 @@ static void xfrm_policy_timer(struct timer_list *t)
next = tmo;
}
if (xp->lft.soft_use_expires_seconds) {
-   long tmo = xp->lft.soft_use_expires_seconds +
+   time64_t tmo = xp->lft.soft_use_expires_seconds +
(xp->curlft.use_time ? : xp->curlft.add_time) - now;
if (tmo <= 0) {
warn = 1;
@@ -240,7 +240,7 @@ static void xfrm_policy_timer(struct timer_list *t)
 
if (warn)
km_policy_expired(xp, dir, 0, 0);
-   if (next != LONG_MAX &&
+   if (next != TIME64_MAX &&
!mod_timer(>timer, jiffies + make_jiffies(next)))
xfrm_pol_hold(xp);
 
@@ -791,7 +791,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, 
int excl)
}
policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, 
policy->index);
hlist_add_head(>byidx, net->xfrm.policy_byidx+idx_hash(net, 
policy->index));
-   policy->curlft.add_time = get_seconds();
+   policy->curlft.add_time = ktime_get_real_seconds();
policy->curlft.use_time = 0;
if (!mod_timer(>timer, jiffies + HZ))
xfrm_pol_hold(policy);
@@ -1282,7 +1282,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, 
struct xfrm_policy *pol)
old_pol = rcu_dereference_protected(sk->sk_policy[dir],
lockdep_is_held(>xfrm.xfrm_policy_lock));
if (pol) {
-   pol->curlft.add_time = get_seconds();
+   pol->curlft.add_time = ktime_get_real_seconds();
pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
xfrm_sk_policy_link(pol, dir);
}
@@ -2132,7 +2132,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct 
dst_entry *dst_orig,
}
 
for (i = 0; i < num_pols; i++)
-   pols[i]->curlft.use_time = get_seconds();
+   pols[i]->curlft.use_time = ktime_get_real_seconds();
 
if (num_xfrms < 0) {
/* Prohibit the flow */
@@ -2352,7 +2352,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct 
sk_buff *skb,
return 1;
}
 
-   pol->curlft.use_time = get_seconds();
+   

[PATCH 01/14] xfrm: Extend the output_mark to support input direction and masking.

2018-07-27 Thread Steffen Klassert
We already support setting an output mark at the xfrm_state,
unfortunately this does not support the input direction and
masking the marks that will be applied to the skb. This change
adds support for applying a masked value in both directions.

The existing XFRMA_OUTPUT_MARK number is reused for this purpose
and as it is now bi-directional, it is renamed to XFRMA_SET_MARK.

An additional XFRMA_SET_MARK_MASK attribute is added for setting the
mask. If the mask attribute is not provided, it is set to 0xffffffff,
keeping the existing XFRMA_OUTPUT_MARK 'full mask' semantics.

Co-developed-by: Tobias Brunner 
Co-developed-by: Eyal Birger 
Co-developed-by: Lorenzo Colitti 
Signed-off-by: Steffen Klassert 
Signed-off-by: Tobias Brunner 
Signed-off-by: Eyal Birger 
Signed-off-by: Lorenzo Colitti 
---
 include/net/xfrm.h|  9 -
 include/uapi/linux/xfrm.h |  4 +++-
 net/xfrm/xfrm_device.c|  3 ++-
 net/xfrm/xfrm_input.c |  2 ++
 net/xfrm/xfrm_output.c|  3 +--
 net/xfrm/xfrm_policy.c|  5 +++--
 net/xfrm/xfrm_user.c  | 48 +--
 7 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 557122846e0e..3dc83ba26f62 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -166,7 +166,7 @@ struct xfrm_state {
int header_len;
int trailer_len;
u32 extra_flags;
-   u32 output_mark;
+   struct xfrm_marksmark;
} props;
 
struct xfrm_lifetime_cfg lft;
@@ -2012,6 +2012,13 @@ static inline int xfrm_mark_put(struct sk_buff *skb, 
const struct xfrm_mark *m)
return ret;
 }
 
+static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x)
+{
+   struct xfrm_mark *m = >props.smark;
+
+   return (m->v & m->m) | (mark & ~m->m);
+}
+
 static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x,
unsigned int family)
 {
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index e3af2859188b..5a6ed7ce5a29 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -305,9 +305,11 @@ enum xfrm_attr_type_t {
XFRMA_ADDRESS_FILTER,   /* struct xfrm_address_filter */
XFRMA_PAD,
XFRMA_OFFLOAD_DEV,  /* struct xfrm_state_offload */
-   XFRMA_OUTPUT_MARK,  /* __u32 */
+   XFRMA_SET_MARK, /* __u32 */
+   XFRMA_SET_MARK_MASK,/* __u32 */
__XFRMA_MAX
 
+#define XFRMA_OUTPUT_MARK XFRMA_SET_MARK   /* Compatibility */
 #define XFRMA_MAX (__XFRMA_MAX - 1)
 };
 
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 175941e15a6e..16c1230d20fa 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -162,7 +162,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state 
*x,
}
 
dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr,
-   x->props.family, x->props.output_mark);
+   x->props.family,
+   xfrm_smark_get(0, x));
if (IS_ERR(dst))
return 0;
 
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 352abca2605f..074810436242 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -339,6 +339,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 
spi, int encap_type)
goto drop;
}
 
+   skb->mark = xfrm_smark_get(skb->mark, x);
+
skb->sp->xvec[skb->sp->len++] = x;
 
 lock:
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 89b178a78dc7..45ba07ab3e4f 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -66,8 +66,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
goto error_nolock;
}
 
-   if (x->props.output_mark)
-   skb->mark = x->props.output_mark;
+   skb->mark = xfrm_smark_get(skb->mark, x);
 
err = x->outer_mode->output(x, skb);
if (err) {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 5f48251c1319..7637637717ec 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1607,10 +1607,11 @@ static struct dst_entry *xfrm_bundle_create(struct 
xfrm_policy *policy,
dst_copy_metrics(dst1, dst);
 
if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
+   __u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
+
family = xfrm[i]->props.family;
dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
- , , family,
- xfrm[i]->props.output_mark);
+  

[PATCH 12/14] xfrm: fix 'passing zero to ERR_PTR()' warning

2018-07-27 Thread Steffen Klassert
From: YueHaibing 

Fix a static code checker warning:

  net/xfrm/xfrm_policy.c:1836 xfrm_resolve_and_create_bundle() warn: passing 
zero to 'ERR_PTR'

xfrm_tmpl_resolve() returning 0 just means no xdst was found; return NULL
instead of passing zero to ERR_PTR.

Fixes: d809ec895505 ("xfrm: do not assume that template resolving always 
returns xfrms")
Signed-off-by: YueHaibing 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_policy.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 2f70fe68b9b0..69f06f879091 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1752,7 +1752,10 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy 
**pols, int num_pols,
/* Try to instantiate a bundle */
err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
if (err <= 0) {
-   if (err != 0 && err != -EAGAIN)
+   if (err == 0)
+   return NULL;
+
+   if (err != -EAGAIN)
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
return ERR_PTR(err);
}
-- 
2.14.1



  1   2   >