[PATCH net-next 04/14] nfp: abm: add helpers for configuring queue marking levels

2018-05-25 Thread Jakub Kicinski
Queue levels for simple ECN marking are stored in _abi_nfd_out_q_lvls_X
symbol, where X is the PCIe PF id.  Find out the location of that symbol
and add helpers for modifying it.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c | 80 +++
 drivers/net/ethernet/netronome/nfp/abm/main.h |  3 +
 .../ethernet/netronome/nfp/nfpcore/nfp_cpp.h  |  5 ++
 3 files changed, 88 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c 
b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
index 676d3afc9bdd..978884a0be19 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -35,12 +35,57 @@
 #include 
 
 #include "../nfpcore/nfp_cpp.h"
+#include "../nfpcore/nfp_nffw.h"
 #include "../nfp_app.h"
 #include "../nfp_abi.h"
 #include "../nfp_main.h"
 #include "../nfp_net.h"
 #include "main.h"
 
+#define NFP_QLVL_SYM_NAME  "_abi_nfd_out_q_lvls_%u"
+#define NFP_QLVL_STRIDE16
+#define NFP_QLVL_THRS  8
+
+static unsigned long long
+nfp_abm_q_lvl_thrs(struct nfp_abm_link *alink, unsigned int queue)
+{
+   return alink->abm->q_lvls->addr +
+   (alink->queue_base + queue) * NFP_QLVL_STRIDE + NFP_QLVL_THRS;
+}
+
+static int
+nfp_abm_ctrl_set_q_lvl(struct nfp_abm_link *alink, unsigned int i, u32 val)
+{
+   struct nfp_cpp *cpp = alink->abm->app->cpp;
+   u32 muw;
+   int err;
+
+   muw = NFP_CPP_ATOMIC_WR(alink->abm->q_lvls->target,
+   alink->abm->q_lvls->domain);
+
+   err = nfp_cpp_writel(cpp, muw, nfp_abm_q_lvl_thrs(alink, i), val);
+   if (err) {
+   nfp_err(cpp, "RED offload setting level failed on vNIC %d queue 
%d\n",
+   alink->id, i);
+   return err;
+   }
+
+   return 0;
+}
+
+int nfp_abm_ctrl_set_all_q_lvls(struct nfp_abm_link *alink, u32 val)
+{
+   int i, err;
+
+   for (i = 0; i < alink->vnic->max_rx_rings; i++) {
+   err = nfp_abm_ctrl_set_q_lvl(alink, i, val);
+   if (err)
+   return err;
+   }
+
+   return 0;
+}
+
 int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm)
 {
return nfp_mbox_cmd(abm->app->pf, NFP_MBOX_PCIE_ABM_ENABLE,
@@ -59,13 +104,48 @@ void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink)
alink->queue_base /= alink->vnic->stride_rx;
 }
 
+static const struct nfp_rtsym *
+nfp_abm_ctrl_find_rtsym(struct nfp_pf *pf, const char *name, unsigned int size)
+{
+   const struct nfp_rtsym *sym;
+
+   sym = nfp_rtsym_lookup(pf->rtbl, name);
+   if (!sym) {
+   nfp_err(pf->cpp, "Symbol '%s' not found\n", name);
+   return ERR_PTR(-ENOENT);
+   }
+   if (sym->size != size) {
+   nfp_err(pf->cpp,
+   "Symbol '%s' wrong size: expected %u got %llu\n",
+   name, size, sym->size);
+   return ERR_PTR(-EINVAL);
+   }
+
+   return sym;
+}
+
+static const struct nfp_rtsym *
+nfp_abm_ctrl_find_q_rtsym(struct nfp_pf *pf, const char *name,
+ unsigned int size)
+{
+   return nfp_abm_ctrl_find_rtsym(pf, name, size * NFP_NET_MAX_RX_RINGS);
+}
+
 int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm)
 {
struct nfp_pf *pf = abm->app->pf;
+   const struct nfp_rtsym *sym;
unsigned int pf_id;
+   char pf_symbol[64];
 
pf_id = nfp_cppcore_pcie_unit(pf->cpp);
abm->pf_id = pf_id;
 
+   snprintf(pf_symbol, sizeof(pf_symbol), NFP_QLVL_SYM_NAME, pf_id);
+   sym = nfp_abm_ctrl_find_q_rtsym(pf, pf_symbol, NFP_QLVL_STRIDE);
+   if (IS_ERR(sym))
+   return PTR_ERR(sym);
+   abm->q_lvls = sym;
+
return 0;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h 
b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 7d129b205535..1ac651cdc140 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -49,11 +49,13 @@ struct nfp_net;
  * @pf_id: ID of our PF link
  * @eswitch_mode:  devlink eswitch mode, advanced functions only visible
  * in switchdev mode
+ * @q_lvls:queue level control area
  */
 struct nfp_abm {
struct nfp_app *app;
unsigned int pf_id;
enum devlink_eswitch_mode eswitch_mode;
+   const struct nfp_rtsym *q_lvls;
 };
 
 /**
@@ -72,6 +74,7 @@ struct nfp_abm_link {
 
 void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink);
 int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm);
+int nfp_abm_ctrl_set_all_q_lvls(struct nfp_abm_link *alink, u32 val);
 int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm);
 int nfp_abm_ctrl_qm_disable(struct nfp_abm *abm);
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h 
b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
index 4e19add1c539..b0da3d436850 100644
--- 

[PATCH net-next 02/14] nfp: prefix vNIC phys_port_name with 'n'

2018-05-25 Thread Jakub Kicinski
Some drivers are using a bare number inside phys_port_name
as VF id and OpenStack's regexps will pick it up.  We can't
use a bare number for our vNICs, prefix the names with 'n'.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1f572896d1ee..75110c8d6a90 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3289,7 +3289,7 @@ nfp_net_get_phys_port_name(struct net_device *netdev, 
char *name, size_t len)
if (nn->dp.is_vf)
return -EOPNOTSUPP;
 
-   n = snprintf(name, len, "%d", nn->id);
+   n = snprintf(name, len, "n%d", nn->id);
if (n >= len)
return -EINVAL;
 
-- 
2.17.0



[PATCH net-next 10/14] nfp: abm: expose all PF queues

2018-05-25 Thread Jakub Kicinski
Allocate the PF representor as multi-queue to allow setting
the configuration per-queue.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/abm/main.c | 10 +++---
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c |  5 +++--
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h |  7 ++-
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c 
b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 4e89159f13d3..ef77d7b0d99d 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -255,14 +255,18 @@ nfp_abm_spawn_repr(struct nfp_app *app, struct 
nfp_abm_link *alink,
struct nfp_reprs *reprs;
struct nfp_repr *repr;
struct nfp_port *port;
+   unsigned int txqs;
int err;
 
-   if (ptype == NFP_PORT_PHYS_PORT)
+   if (ptype == NFP_PORT_PHYS_PORT) {
rtype = NFP_REPR_TYPE_PHYS_PORT;
-   else
+   txqs = 1;
+   } else {
rtype = NFP_REPR_TYPE_PF;
+   txqs = alink->vnic->max_rx_rings;
+   }
 
-   netdev = nfp_repr_alloc(app);
+   netdev = nfp_repr_alloc_mqs(app, txqs, 1);
if (!netdev)
return -ENOMEM;
repr = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index 117eca6819de..d7b712f6362f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -360,12 +360,13 @@ void nfp_repr_free(struct net_device *netdev)
__nfp_repr_free(netdev_priv(netdev));
 }
 
-struct net_device *nfp_repr_alloc(struct nfp_app *app)
+struct net_device *
+nfp_repr_alloc_mqs(struct nfp_app *app, unsigned int txqs, unsigned int rxqs)
 {
struct net_device *netdev;
struct nfp_repr *repr;
 
-   netdev = alloc_etherdev(sizeof(*repr));
+   netdev = alloc_etherdev_mqs(sizeof(*repr), txqs, rxqs);
if (!netdev)
return NULL;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
index 8366e4f3c623..1bf2b18109ab 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
@@ -126,7 +126,8 @@ int nfp_repr_init(struct nfp_app *app, struct net_device 
*netdev,
  u32 cmsg_port_id, struct nfp_port *port,
  struct net_device *pf_netdev);
 void nfp_repr_free(struct net_device *netdev);
-struct net_device *nfp_repr_alloc(struct nfp_app *app);
+struct net_device *
+nfp_repr_alloc_mqs(struct nfp_app *app, unsigned int txqs, unsigned int rxqs);
 void nfp_repr_clean_and_free(struct nfp_repr *repr);
 void nfp_reprs_clean_and_free(struct nfp_app *app, struct nfp_reprs *reprs);
 void nfp_reprs_clean_and_free_by_type(struct nfp_app *app,
@@ -134,4 +135,8 @@ void nfp_reprs_clean_and_free_by_type(struct nfp_app *app,
 struct nfp_reprs *nfp_reprs_alloc(unsigned int num_reprs);
 int nfp_reprs_resync_phys_ports(struct nfp_app *app);
 
+static inline struct net_device *nfp_repr_alloc(struct nfp_app *app)
+{
+   return nfp_repr_alloc_mqs(app, 1, 1);
+}
 #endif /* NFP_NET_REPR_H */
-- 
2.17.0



[PATCH net-next 06/14] net: sched: add qstats.qlen to qlen

2018-05-25 Thread Jakub Kicinski
AFAICT struct gnet_stats_queue.qlen is not used in Qdiscs.
It may, however, be useful for offloads to report HW queue
length there.  Add that value to the result of qdisc_qlen_sum().

Signed-off-by: Jakub Kicinski 
---
 include/net/sch_generic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 98c10a28cd01..0b786c8204b9 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -350,14 +350,14 @@ static inline int qdisc_qlen(const struct Qdisc *q)
 
 static inline int qdisc_qlen_sum(const struct Qdisc *q)
 {
-   __u32 qlen = 0;
+   __u32 qlen = q->qstats.qlen;
int i;
 
if (q->flags & TCQ_F_NOLOCK) {
for_each_possible_cpu(i)
qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
} else {
-   qlen = q->q.qlen;
+   qlen += q->q.qlen;
}
 
return qlen;
-- 
2.17.0



[PATCH net-next 09/14] nfp: abm: expose the internal stats in ethtool

2018-05-25 Thread Jakub Kicinski
There is a handful of statistics exposing some internal details
of the implementation.  Expose those via ethtool.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c | 22 
 drivers/net/ethernet/netronome/nfp/abm/main.c | 51 +++
 drivers/net/ethernet/netronome/nfp/abm/main.h |  2 +
 3 files changed, 75 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c 
b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
index d2d9ca7a727c..79fc9147c012 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -50,6 +50,8 @@
 
 #define NFP_QMSTAT_SYM_NAME"_abi_nfdqm%u_stats"
 #define NFP_QMSTAT_STRIDE  32
+#define NFP_QMSTAT_NON_STO 0
+#define NFP_QMSTAT_STO 8
 #define NFP_QMSTAT_DROP16
 #define NFP_QMSTAT_ECN 24
 
@@ -142,6 +144,26 @@ int nfp_abm_ctrl_set_all_q_lvls(struct nfp_abm_link 
*alink, u32 val)
return 0;
 }
 
+u64 nfp_abm_ctrl_stat_non_sto(struct nfp_abm_link *alink, unsigned int i)
+{
+   u64 val;
+
+   if (nfp_abm_ctrl_stat(alink, alink->abm->qm_stats, NFP_QMSTAT_STRIDE,
+ NFP_QMSTAT_NON_STO, i, true, ))
+   return 0;
+   return val;
+}
+
+u64 nfp_abm_ctrl_stat_sto(struct nfp_abm_link *alink, unsigned int i)
+{
+   u64 val;
+
+   if (nfp_abm_ctrl_stat(alink, alink->abm->qm_stats, NFP_QMSTAT_STRIDE,
+ NFP_QMSTAT_STO, i, true, ))
+   return 0;
+   return val;
+}
+
 int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
struct nfp_alink_stats *stats)
 {
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c 
b/drivers/net/ethernet/netronome/nfp/abm/main.c
index d0c21899a8b7..4e89159f13d3 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -497,6 +497,53 @@ static void nfp_abm_vnic_free(struct nfp_app *app, struct 
nfp_net *nn)
kfree(alink);
 }
 
+static u64 *
+nfp_abm_port_get_stats(struct nfp_app *app, struct nfp_port *port, u64 *data)
+{
+   struct nfp_repr *repr = netdev_priv(port->netdev);
+   struct nfp_abm_link *alink;
+   unsigned int i;
+
+   if (port->type != NFP_PORT_PF_PORT)
+   return data;
+   alink = repr->app_priv;
+   for (i = 0; i < alink->vnic->dp.num_r_vecs; i++) {
+   *data++ = nfp_abm_ctrl_stat_non_sto(alink, i);
+   *data++ = nfp_abm_ctrl_stat_sto(alink, i);
+   }
+   return data;
+}
+
+static int
+nfp_abm_port_get_stats_count(struct nfp_app *app, struct nfp_port *port)
+{
+   struct nfp_repr *repr = netdev_priv(port->netdev);
+   struct nfp_abm_link *alink;
+
+   if (port->type != NFP_PORT_PF_PORT)
+   return 0;
+   alink = repr->app_priv;
+   return alink->vnic->dp.num_r_vecs * 2;
+}
+
+static u8 *
+nfp_abm_port_get_stats_strings(struct nfp_app *app, struct nfp_port *port,
+  u8 *data)
+{
+   struct nfp_repr *repr = netdev_priv(port->netdev);
+   struct nfp_abm_link *alink;
+   unsigned int i;
+
+   if (port->type != NFP_PORT_PF_PORT)
+   return data;
+   alink = repr->app_priv;
+   for (i = 0; i < alink->vnic->dp.num_r_vecs; i++) {
+   data = nfp_pr_et(data, "q%u_no_wait", i);
+   data = nfp_pr_et(data, "q%u_delayed", i);
+   }
+   return data;
+}
+
 static int nfp_abm_init(struct nfp_app *app)
 {
struct nfp_pf *pf = app->pf;
@@ -575,6 +622,10 @@ const struct nfp_app_type app_abm = {
.vnic_alloc = nfp_abm_vnic_alloc,
.vnic_free  = nfp_abm_vnic_free,
 
+   .port_get_stats = nfp_abm_port_get_stats,
+   .port_get_stats_count   = nfp_abm_port_get_stats_count,
+   .port_get_stats_strings = nfp_abm_port_get_stats_strings,
+
.setup_tc   = nfp_abm_setup_tc,
 
.eswitch_mode_get   = nfp_abm_eswitch_mode_get,
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h 
b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 93a3b79cf468..09fd15847961 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -123,6 +123,8 @@ int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
struct nfp_alink_stats *stats);
 int nfp_abm_ctrl_read_xstats(struct nfp_abm_link *alink,
 struct nfp_alink_xstats *xstats);
+u64 nfp_abm_ctrl_stat_non_sto(struct nfp_abm_link *alink, unsigned int i);
+u64 nfp_abm_ctrl_stat_sto(struct nfp_abm_link *alink, unsigned int i);
 int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm);
 int nfp_abm_ctrl_qm_disable(struct nfp_abm *abm);
 #endif
-- 
2.17.0



[PATCH net-next 07/14] nfp: abm: report statistics from RED offload

2018-05-25 Thread Jakub Kicinski
Report basic and extended RED statistics back to TC.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Dirk van der Merwe 
---
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c | 114 ++
 drivers/net/ethernet/netronome/nfp/abm/main.c |  92 ++
 drivers/net/ethernet/netronome/nfp/abm/main.h |  38 ++
 3 files changed, 244 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c 
b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
index 978884a0be19..d2d9ca7a727c 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -44,8 +44,15 @@
 
 #define NFP_QLVL_SYM_NAME  "_abi_nfd_out_q_lvls_%u"
 #define NFP_QLVL_STRIDE16
+#define NFP_QLVL_BLOG_BYTES0
+#define NFP_QLVL_BLOG_PKTS 4
 #define NFP_QLVL_THRS  8
 
+#define NFP_QMSTAT_SYM_NAME"_abi_nfdqm%u_stats"
+#define NFP_QMSTAT_STRIDE  32
+#define NFP_QMSTAT_DROP16
+#define NFP_QMSTAT_ECN 24
+
 static unsigned long long
 nfp_abm_q_lvl_thrs(struct nfp_abm_link *alink, unsigned int queue)
 {
@@ -53,6 +60,55 @@ nfp_abm_q_lvl_thrs(struct nfp_abm_link *alink, unsigned int 
queue)
(alink->queue_base + queue) * NFP_QLVL_STRIDE + NFP_QLVL_THRS;
 }
 
+static int
+nfp_abm_ctrl_stat(struct nfp_abm_link *alink, const struct nfp_rtsym *sym,
+ unsigned int stride, unsigned int offset, unsigned int i,
+ bool is_u64, u64 *res)
+{
+   struct nfp_cpp *cpp = alink->abm->app->cpp;
+   u32 val32, mur;
+   u64 val, addr;
+   int err;
+
+   mur = NFP_CPP_ATOMIC_RD(sym->target, sym->domain);
+
+   addr = sym->addr + (alink->queue_base + i) * stride + offset;
+   if (is_u64)
+   err = nfp_cpp_readq(cpp, mur, addr, );
+   else
+   err = nfp_cpp_readl(cpp, mur, addr, );
+   if (err) {
+   nfp_err(cpp,
+   "RED offload reading stat failed on vNIC %d queue %d\n",
+   alink->id, i);
+   return err;
+   }
+
+   *res = is_u64 ? val : val32;
+   return 0;
+}
+
+static int
+nfp_abm_ctrl_stat_all(struct nfp_abm_link *alink, const struct nfp_rtsym *sym,
+ unsigned int stride, unsigned int offset, bool is_u64,
+ u64 *res)
+{
+   u64 val, sum = 0;
+   unsigned int i;
+   int err;
+
+   for (i = 0; i < alink->vnic->max_rx_rings; i++) {
+   err = nfp_abm_ctrl_stat(alink, sym, stride, offset, i,
+   is_u64, );
+   if (err)
+   return err;
+   sum += val;
+   }
+
+   *res = sum;
+   return 0;
+}
+
 static int
 nfp_abm_ctrl_set_q_lvl(struct nfp_abm_link *alink, unsigned int i, u32 val)
 {
@@ -86,6 +142,58 @@ int nfp_abm_ctrl_set_all_q_lvls(struct nfp_abm_link *alink, 
u32 val)
return 0;
 }
 
+int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
+   struct nfp_alink_stats *stats)
+{
+   u64 pkts = 0, bytes = 0;
+   int i, err;
+
+   for (i = 0; i < alink->vnic->max_rx_rings; i++) {
+   pkts += nn_readq(alink->vnic, NFP_NET_CFG_RXR_STATS(i));
+   bytes += nn_readq(alink->vnic, NFP_NET_CFG_RXR_STATS(i) + 8);
+   }
+   stats->tx_pkts = pkts;
+   stats->tx_bytes = bytes;
+
+   err = nfp_abm_ctrl_stat_all(alink, alink->abm->q_lvls,
+   NFP_QLVL_STRIDE, NFP_QLVL_BLOG_BYTES,
+   false, >backlog_bytes);
+   if (err)
+   return err;
+
+   err = nfp_abm_ctrl_stat_all(alink, alink->abm->q_lvls,
+   NFP_QLVL_STRIDE, NFP_QLVL_BLOG_PKTS,
+   false, >backlog_pkts);
+   if (err)
+   return err;
+
+   err = nfp_abm_ctrl_stat_all(alink, alink->abm->qm_stats,
+   NFP_QMSTAT_STRIDE, NFP_QMSTAT_DROP,
+   true, >drops);
+   if (err)
+   return err;
+
+   return nfp_abm_ctrl_stat_all(alink, alink->abm->qm_stats,
+NFP_QMSTAT_STRIDE, NFP_QMSTAT_ECN,
+true, >overlimits);
+}
+
+int nfp_abm_ctrl_read_xstats(struct nfp_abm_link *alink,
+struct nfp_alink_xstats *xstats)
+{
+   int err;
+
+   err = nfp_abm_ctrl_stat_all(alink, alink->abm->qm_stats,
+   NFP_QMSTAT_STRIDE, NFP_QMSTAT_DROP,
+   true, >pdrop);
+   if (err)
+   return err;
+
+   return nfp_abm_ctrl_stat_all(alink, alink->abm->qm_stats,
+NFP_QMSTAT_STRIDE, NFP_QMSTAT_ECN,
+true, >ecn_marked);
+}
+
 int nfp_abm_ctrl_qm_enable(struct 

[PATCH net-next 14/14] nfp: abm: report correct MQ stats

2018-05-25 Thread Jakub Kicinski
Report the stat diff to make sure MQ stats add up to child stats.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Dirk van der Merwe 
---
 drivers/net/ethernet/netronome/nfp/abm/main.c | 24 +++
 1 file changed, 24 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c 
b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 21d5af1fb061..1561c2724c26 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -279,6 +279,28 @@ nfp_abm_setup_tc_red(struct net_device *netdev, struct 
nfp_abm_link *alink,
}
 }
 
+static int
+nfp_abm_mq_stats(struct nfp_abm_link *alink, struct tc_mq_qopt_offload *opt)
+{
+   struct nfp_alink_stats stats;
+   unsigned int i;
+   int err;
+
+   for (i = 0; i < alink->num_qdiscs; i++) {
+   if (alink->qdiscs[i].handle == TC_H_UNSPEC)
+   continue;
+
+   err = nfp_abm_ctrl_read_q_stats(alink, i, );
+   if (err)
+   return err;
+
+   nfp_abm_update_stats(, >qdiscs[i].stats,
+>stats);
+   }
+
+   return 0;
+}
+
 static int
 nfp_abm_setup_tc_mq(struct net_device *netdev, struct nfp_abm_link *alink,
struct tc_mq_qopt_offload *opt)
@@ -292,6 +314,8 @@ nfp_abm_setup_tc_mq(struct net_device *netdev, struct 
nfp_abm_link *alink,
if (opt->handle == alink->parent)
nfp_abm_reset_root(netdev, alink, TC_H_ROOT, 0);
return 0;
+   case TC_MQ_STATS:
+   return nfp_abm_mq_stats(alink, opt);
default:
return -EOPNOTSUPP;
}
-- 
2.17.0



[PATCH net-next 08/14] nfp: allow apps to add extra stats to ports

2018-05-25 Thread Jakub Kicinski
Allow nfp apps to add extra ethtool stats.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_app.c  | 22 +++
 drivers/net/ethernet/netronome/nfp/nfp_app.h  | 13 +++
 .../ethernet/netronome/nfp/nfp_net_ethtool.c  | 10 +++--
 drivers/net/ethernet/netronome/nfp/nfp_port.h |  2 ++
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c 
b/drivers/net/ethernet/netronome/nfp/nfp_app.c
index c9d8a7ab311e..f28b244f4ee7 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c
@@ -43,6 +43,7 @@
 #include "nfp_main.h"
 #include "nfp_net.h"
 #include "nfp_net_repr.h"
+#include "nfp_port.h"
 
 static const struct nfp_app_type *apps[] = {
[NFP_APP_CORE_NIC]  = _nic,
@@ -85,6 +86,27 @@ const char *nfp_app_mip_name(struct nfp_app *app)
return nfp_mip_name(app->pf->mip);
 }
 
+u64 *nfp_app_port_get_stats(struct nfp_port *port, u64 *data)
+{
+   if (!port || !port->app || !port->app->type->port_get_stats)
+   return data;
+   return port->app->type->port_get_stats(port->app, port, data);
+}
+
+int nfp_app_port_get_stats_count(struct nfp_port *port)
+{
+   if (!port || !port->app || !port->app->type->port_get_stats_count)
+   return 0;
+   return port->app->type->port_get_stats_count(port->app, port);
+}
+
+u8 *nfp_app_port_get_stats_strings(struct nfp_port *port, u8 *data)
+{
+   if (!port || !port->app || !port->app->type->port_get_stats_strings)
+   return data;
+   return port->app->type->port_get_stats_strings(port->app, port, data);
+}
+
 struct sk_buff *
 nfp_app_ctrl_msg_alloc(struct nfp_app *app, unsigned int size, gfp_t priority)
 {
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h 
b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index 23b99a4e05c2..ee74caacb015 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -90,6 +90,9 @@ extern const struct nfp_app_type app_abm;
  * @repr_stop: representor netdev stop callback
  * @check_mtu: MTU change request on a netdev (verify it is valid)
  * @repr_change_mtu:   MTU change request on repr (make and verify change)
+ * @port_get_stats:get extra ethtool statistics for a port
+ * @port_get_stats_count:  get count of extra statistics for a port
+ * @port_get_stats_strings:get strings for extra statistics
  * @start: start application logic
  * @stop:  stop application logic
  * @ctrl_msg_rx:control message handler
@@ -132,6 +135,12 @@ struct nfp_app_type {
int (*repr_change_mtu)(struct nfp_app *app, struct net_device *netdev,
   int new_mtu);
 
+   u64 *(*port_get_stats)(struct nfp_app *app,
+  struct nfp_port *port, u64 *data);
+   int (*port_get_stats_count)(struct nfp_app *app, struct nfp_port *port);
+   u8 *(*port_get_stats_strings)(struct nfp_app *app,
+ struct nfp_port *port, u8 *data);
+
int (*start)(struct nfp_app *app);
void (*stop)(struct nfp_app *app);
 
@@ -404,6 +413,10 @@ static inline struct net_device *nfp_app_repr_get(struct 
nfp_app *app, u32 id)
 
 struct nfp_app *nfp_app_from_netdev(struct net_device *netdev);
 
+u64 *nfp_app_port_get_stats(struct nfp_port *port, u64 *data);
+int nfp_app_port_get_stats_count(struct nfp_port *port);
+u8 *nfp_app_port_get_stats_strings(struct nfp_port *port, u8 *data);
+
 struct nfp_reprs *
 nfp_reprs_get_locked(struct nfp_app *app, enum nfp_repr_type type);
 struct nfp_reprs *
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index c9016419bfa0..26d1cc4e2906 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -437,7 +437,7 @@ static int nfp_net_set_ringparam(struct net_device *netdev,
return nfp_net_set_ring_size(nn, rxd_cnt, txd_cnt);
 }
 
-static __printf(2, 3) u8 *nfp_pr_et(u8 *data, const char *fmt, ...)
+__printf(2, 3) u8 *nfp_pr_et(u8 *data, const char *fmt, ...)
 {
va_list args;
 
@@ -637,6 +637,7 @@ static void nfp_net_get_strings(struct net_device *netdev,
 nn->dp.num_tx_rings,
 false);
data = nfp_mac_get_stats_strings(netdev, data);
+   data = nfp_app_port_get_stats_strings(nn->port, data);
break;
}
 }
@@ -651,6 +652,7 @@ nfp_net_get_stats(struct net_device *netdev, struct 
ethtool_stats *stats,
data = nfp_vnic_get_hw_stats(data, nn->dp.ctrl_bar,
 nn->dp.num_rx_rings, nn->dp.num_tx_rings);
data = nfp_mac_get_stats(netdev, data);
+   data = 

[PATCH net-next 12/14] nfp: abm: multi-queue RED offload

2018-05-25 Thread Jakub Kicinski
Add support for MQ offload and setting RED parameters
on queue-by-queue basis.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Dirk van der Merwe 
---
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c |  50 -
 drivers/net/ethernet/netronome/nfp/abm/main.c | 192 ++
 drivers/net/ethernet/netronome/nfp/abm/main.h |  14 +-
 3 files changed, 208 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c 
b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
index 79fc9147c012..b157ccd8c80f 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -111,8 +111,7 @@ nfp_abm_ctrl_stat_all(struct nfp_abm_link *alink, const 
struct nfp_rtsym *sym,
return 0;
 }
 
-static int
-nfp_abm_ctrl_set_q_lvl(struct nfp_abm_link *alink, unsigned int i, u32 val)
+int nfp_abm_ctrl_set_q_lvl(struct nfp_abm_link *alink, unsigned int i, u32 val)
 {
struct nfp_cpp *cpp = alink->abm->app->cpp;
u32 muw;
@@ -164,6 +163,37 @@ u64 nfp_abm_ctrl_stat_sto(struct nfp_abm_link *alink, 
unsigned int i)
return val;
 }
 
+int nfp_abm_ctrl_read_q_stats(struct nfp_abm_link *alink, unsigned int i,
+ struct nfp_alink_stats *stats)
+{
+   int err;
+
+   stats->tx_pkts = nn_readq(alink->vnic, NFP_NET_CFG_RXR_STATS(i));
+   stats->tx_bytes = nn_readq(alink->vnic, NFP_NET_CFG_RXR_STATS(i) + 8);
+
+   err = nfp_abm_ctrl_stat(alink, alink->abm->q_lvls,
+   NFP_QLVL_STRIDE, NFP_QLVL_BLOG_BYTES,
+   i, false, >backlog_bytes);
+   if (err)
+   return err;
+
+   err = nfp_abm_ctrl_stat(alink, alink->abm->q_lvls,
+   NFP_QLVL_STRIDE, NFP_QLVL_BLOG_PKTS,
+   i, false, >backlog_pkts);
+   if (err)
+   return err;
+
+   err = nfp_abm_ctrl_stat(alink, alink->abm->qm_stats,
+   NFP_QMSTAT_STRIDE, NFP_QMSTAT_DROP,
+   i, true, >drops);
+   if (err)
+   return err;
+
+   return nfp_abm_ctrl_stat(alink, alink->abm->qm_stats,
+NFP_QMSTAT_STRIDE, NFP_QMSTAT_ECN,
+i, true, >overlimits);
+}
+
 int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
struct nfp_alink_stats *stats)
 {
@@ -200,6 +230,22 @@ int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
 true, >overlimits);
 }
 
+int nfp_abm_ctrl_read_q_xstats(struct nfp_abm_link *alink, unsigned int i,
+  struct nfp_alink_xstats *xstats)
+{
+   int err;
+
+   err = nfp_abm_ctrl_stat(alink, alink->abm->qm_stats,
+   NFP_QMSTAT_STRIDE, NFP_QMSTAT_DROP,
+   i, true, >pdrop);
+   if (err)
+   return err;
+
+   return nfp_abm_ctrl_stat(alink, alink->abm->qm_stats,
+NFP_QMSTAT_STRIDE, NFP_QMSTAT_ECN,
+i, true, >ecn_marked);
+}
+
 int nfp_abm_ctrl_read_xstats(struct nfp_abm_link *alink,
 struct nfp_alink_xstats *xstats)
 {
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c 
b/drivers/net/ethernet/netronome/nfp/abm/main.c
index ef77d7b0d99d..21d5af1fb061 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -58,43 +58,77 @@ static u32 nfp_abm_portid(enum nfp_repr_type rtype, 
unsigned int id)
   FIELD_PREP(NFP_ABM_PORTID_ID, id);
 }
 
-static int nfp_abm_reset_stats(struct nfp_abm_link *alink)
+static int
+__nfp_abm_reset_root(struct net_device *netdev, struct nfp_abm_link *alink,
+u32 handle, unsigned int qs, u32 init_val)
 {
-   int err;
+   struct nfp_port *port = nfp_port_from_netdev(netdev);
+   int ret;
 
-   err = nfp_abm_ctrl_read_stats(alink, >qdiscs[0].stats);
-   if (err)
-   return err;
-   alink->qdiscs[0].stats.backlog_pkts = 0;
-   alink->qdiscs[0].stats.backlog_bytes = 0;
+   ret = nfp_abm_ctrl_set_all_q_lvls(alink, init_val);
+   memset(alink->qdiscs, 0, sizeof(*alink->qdiscs) * alink->num_qdiscs);
 
-   err = nfp_abm_ctrl_read_xstats(alink, >qdiscs[0].xstats);
-   if (err)
-   return err;
+   alink->parent = handle;
+   alink->num_qdiscs = qs;
+   port->tc_offload_cnt = qs;
 
-   return 0;
+   return ret;
+}
+
+static void
+nfp_abm_reset_root(struct net_device *netdev, struct nfp_abm_link *alink,
+  u32 handle, unsigned int qs)
+{
+   __nfp_abm_reset_root(netdev, alink, handle, qs, ~0);
+}
+
+static int
+nfp_abm_red_find(struct nfp_abm_link *alink, struct tc_red_qopt_offload *opt)
+{
+   unsigned int i = 

[PATCH net-next 11/14] net: sched: mq: add simple offload notification

2018-05-25 Thread Jakub Kicinski
mq offload is trivial, we just need to let the device know
that the root qdisc is mq.  Alternative approach would be
to export qdisc_lookup() and make drivers check the root
type themselves, but notification via ndo_setup_tc is more
in line with other qdiscs.

Note that mq doesn't hold any stats on its own, it just
adds up stats of its children.

Signed-off-by: Jakub Kicinski 
---
 include/linux/netdevice.h |  1 +
 include/net/pkt_cls.h | 10 ++
 net/sched/sch_mq.c| 19 +++
 3 files changed, 30 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8452f72087ef..29ef76360cc8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -791,6 +791,7 @@ enum tc_setup_type {
TC_SETUP_QDISC_CBS,
TC_SETUP_QDISC_RED,
TC_SETUP_QDISC_PRIO,
+   TC_SETUP_QDISC_MQ,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index f3ec43725724..942f839dbca4 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -778,6 +778,16 @@ struct tc_qopt_offload_stats {
struct gnet_stats_queue *qstats;
 };
 
+enum tc_mq_command {
+   TC_MQ_CREATE,
+   TC_MQ_DESTROY,
+};
+
+struct tc_mq_qopt_offload {
+   enum tc_mq_command command;
+   u32 handle;
+};
+
 enum tc_red_command {
TC_RED_REPLACE,
TC_RED_DESTROY,
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index f062a18e9162..6ccf6daa2503 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -23,12 +24,28 @@ struct mq_sched {
struct Qdisc**qdiscs;
 };
 
+static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
+{
+   struct net_device *dev = qdisc_dev(sch);
+   struct tc_mq_qopt_offload opt = {
+   .command = cmd,
+   .handle = sch->handle,
+   };
+
+   if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+   return -EOPNOTSUPP;
+
+   return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, );
+}
+
 static void mq_destroy(struct Qdisc *sch)
 {
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
unsigned int ntx;
 
+   mq_offload(sch, TC_MQ_DESTROY);
+
if (!priv->qdiscs)
return;
for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
@@ -70,6 +87,8 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt,
}
 
sch->flags |= TCQ_F_MQROOT;
+
+   mq_offload(sch, TC_MQ_CREATE);
return 0;
 }
 
-- 
2.17.0



[PATCH net-next 00/14] nfp: abm: RED/MQ qdisc offload

2018-05-25 Thread Jakub Kicinski
Hi!

This is second batch of advanced buffer management nfp driver
changes.  This series adds the qdisc offload.  Support for
a very simple subset of RED qdisc offload is added as needed
for DCTCP ECN marking (min and max thresholds set to the same
value).

The first two patches fix glitches introduced by the previous
series.  We have to be careful about phys_port_name handling,
because VFs share the same code path, and some user space may
get confused by the names we chose.

Since unlike previous offloads we can report the queue backlog
both in bytes and packets we need to adjust how statistics are
added up in the core (patch 6).

There are some extra statistics we want to expose which don't
fit into TC stats, namely counts of packets which have been fast-
-forwarded without getting enqueued because there was no
contention and number of packets that were ever queued (sum of
all momentary backlogs).  We expose those through ethtool stats
(patches 8 and 9).

Remaining 5 patches add MQ offload - to be able to set different
configurations on different queues.  Representors are made multi-
-queue and we add offload support to MQ.  MQ stats are added up
before calling ->dump qdiscs on the children, and therefore don't
include updated offload values.  To avoid clearly incorrect stats
MQ is made to also request stats update from offloads.  This way
we can correct the diff at the driver level.


Jakub Kicinski (14):
  nfp: return -EOPNOTSUPP from .ndo_get_phys_port_name for VFs
  nfp: prefix vNIC phys_port_name with 'n'
  nfp: abm: enable advanced queuing on demand
  nfp: abm: add helpers for configuring queue marking levels
  nfp: abm: add simple RED offload
  net: sched: add qstats.qlen to qlen
  nfp: abm: report statistics from RED offload
  nfp: allow apps to add extra stats to ports
  nfp: abm: expose the internal stats in ethtool
  nfp: abm: expose all PF queues
  net: sched: mq: add simple offload notification
  nfp: abm: multi-queue RED offload
  net: sched: mq: request stats from offloads
  nfp: abm: report correct MQ stats

 drivers/net/ethernet/netronome/nfp/abm/ctrl.c | 275 +
 drivers/net/ethernet/netronome/nfp/abm/main.c | 374 +-
 drivers/net/ethernet/netronome/nfp/abm/main.h |  67 
 drivers/net/ethernet/netronome/nfp/nfp_abi.h  |  14 +
 drivers/net/ethernet/netronome/nfp/nfp_app.c  |  22 ++
 drivers/net/ethernet/netronome/nfp/nfp_app.h  |  13 +
 .../ethernet/netronome/nfp/nfp_net_common.c   |  11 +-
 .../ethernet/netronome/nfp/nfp_net_ethtool.c  |  10 +-
 .../net/ethernet/netronome/nfp/nfp_net_repr.c |   5 +-
 .../net/ethernet/netronome/nfp/nfp_net_repr.h |   7 +-
 drivers/net/ethernet/netronome/nfp/nfp_port.h |   2 +
 .../ethernet/netronome/nfp/nfpcore/nfp_cpp.h  |   5 +
 include/linux/netdevice.h |   1 +
 include/net/pkt_cls.h |  12 +
 include/net/sch_generic.h |   4 +-
 net/sched/sch_mq.c|  37 ++
 16 files changed, 843 insertions(+), 16 deletions(-)

-- 
2.17.0



[PATCH net-next 01/14] nfp: return -EOPNOTSUPP from .ndo_get_phys_port_name for VFs

2018-05-25 Thread Jakub Kicinski
After recent change we started returning 0 from
ndo_get_phys_port_name for VFs.  The name parameter for
ndo_get_phys_port_name is not initialized by the stack so
this can lead to a crash.  We should have kept returning
-EOPNOTSUPP in the first place.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index eea11e881bf5..1f572896d1ee 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3286,11 +3286,12 @@ nfp_net_get_phys_port_name(struct net_device *netdev, 
char *name, size_t len)
if (nn->port)
return nfp_port_get_phys_port_name(netdev, name, len);
 
-   if (!nn->dp.is_vf) {
-   n = snprintf(name, len, "%d", nn->id);
-   if (n >= len)
-   return -EINVAL;
-   }
+   if (nn->dp.is_vf)
+   return -EOPNOTSUPP;
+
+   n = snprintf(name, len, "%d", nn->id);
+   if (n >= len)
+   return -EINVAL;
 
return 0;
 }
-- 
2.17.0



[PATCH net-next 13/14] net: sched: mq: request stats from offloads

2018-05-25 Thread Jakub Kicinski
MQ doesn't hold any statistics on its own, however, statistic
from offloads are requested starting from the root, hence MQ
will read the old values for its sums.  Call into the drivers,
because of the additive nature of the stats drivers are aware
of how many "pending updates" they have to children of the MQ.
Since MQ resets its stats on every dump we can simply offset
the stats, predicting how stats of offloaded children will
change.

Signed-off-by: Jakub Kicinski 
---
 include/net/pkt_cls.h |  2 ++
 net/sched/sch_mq.c| 18 ++
 2 files changed, 20 insertions(+)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 942f839dbca4..a3c1a2c47cd4 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -781,11 +781,13 @@ struct tc_qopt_offload_stats {
 enum tc_mq_command {
TC_MQ_CREATE,
TC_MQ_DESTROY,
+   TC_MQ_STATS,
 };
 
 struct tc_mq_qopt_offload {
enum tc_mq_command command;
u32 handle;
+   struct tc_qopt_offload_stats stats;
 };
 
 enum tc_red_command {
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 6ccf6daa2503..d6b8ae4ed7a3 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -38,6 +38,22 @@ static int mq_offload(struct Qdisc *sch, enum tc_mq_command 
cmd)
return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, );
 }
 
+static void mq_offload_stats(struct Qdisc *sch)
+{
+   struct net_device *dev = qdisc_dev(sch);
+   struct tc_mq_qopt_offload opt = {
+   .command = TC_MQ_STATS,
+   .handle = sch->handle,
+   .stats = {
+   .bstats = >bstats,
+   .qstats = >qstats,
+   },
+   };
+
+   if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc)
+   dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, );
+}
+
 static void mq_destroy(struct Qdisc *sch)
 {
struct net_device *dev = qdisc_dev(sch);
@@ -146,6 +162,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
sch->q.qlen += qdisc->q.qlen;
sch->bstats.bytes   += qdisc->bstats.bytes;
sch->bstats.packets += qdisc->bstats.packets;
+   sch->qstats.qlen+= qdisc->qstats.qlen;
sch->qstats.backlog += qdisc->qstats.backlog;
sch->qstats.drops   += qdisc->qstats.drops;
sch->qstats.requeues+= qdisc->qstats.requeues;
@@ -154,6 +171,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
 
spin_unlock_bh(qdisc_lock(qdisc));
}
+   mq_offload_stats(sch);
 
return 0;
 }
-- 
2.17.0



[PATCH net-next 05/14] nfp: abm: add simple RED offload

2018-05-25 Thread Jakub Kicinski
Offload simple RED configurations.  For now support only DCTCP
like scenarios where min and max are the same.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/abm/main.c | 82 +++
 drivers/net/ethernet/netronome/nfp/abm/main.h | 10 +++
 2 files changed, 92 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c 
b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 28a18ac62040..22251d88c958 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -38,6 +38,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "../nfpcore/nfp.h"
 #include "../nfpcore/nfp_cpp.h"
@@ -55,6 +57,84 @@ static u32 nfp_abm_portid(enum nfp_repr_type rtype, unsigned 
int id)
   FIELD_PREP(NFP_ABM_PORTID_ID, id);
 }
 
+static void
+nfp_abm_red_destroy(struct net_device *netdev, struct nfp_abm_link *alink,
+   u32 handle)
+{
+   struct nfp_port *port = nfp_port_from_netdev(netdev);
+
+   if (handle != alink->qdiscs[0].handle)
+   return;
+
+   alink->qdiscs[0].handle = TC_H_UNSPEC;
+   port->tc_offload_cnt = 0;
+   nfp_abm_ctrl_set_all_q_lvls(alink, ~0);
+}
+
+static int
+nfp_abm_red_replace(struct net_device *netdev, struct nfp_abm_link *alink,
+   struct tc_red_qopt_offload *opt)
+{
+   struct nfp_port *port = nfp_port_from_netdev(netdev);
+   int err;
+
+   if (opt->set.min != opt->set.max || !opt->set.is_ecn) {
+   nfp_warn(alink->abm->app->cpp,
+"RED offload failed - unsupported parameters\n");
+   err = -EINVAL;
+   goto err_destroy;
+   }
+   err = nfp_abm_ctrl_set_all_q_lvls(alink, opt->set.min);
+   if (err)
+   goto err_destroy;
+
+   alink->qdiscs[0].handle = opt->handle;
+   port->tc_offload_cnt = 1;
+
+   return 0;
+err_destroy:
+   if (alink->qdiscs[0].handle != TC_H_UNSPEC)
+   nfp_abm_red_destroy(netdev, alink, alink->qdiscs[0].handle);
+   return err;
+}
+
+static int
+nfp_abm_setup_tc_red(struct net_device *netdev, struct nfp_abm_link *alink,
+struct tc_red_qopt_offload *opt)
+{
+   if (opt->parent != TC_H_ROOT)
+   return -EOPNOTSUPP;
+
+   switch (opt->command) {
+   case TC_RED_REPLACE:
+   return nfp_abm_red_replace(netdev, alink, opt);
+   case TC_RED_DESTROY:
+   nfp_abm_red_destroy(netdev, alink, opt->handle);
+   return 0;
+   default:
+   return -EOPNOTSUPP;
+   }
+}
+
+static int
+nfp_abm_setup_tc(struct nfp_app *app, struct net_device *netdev,
+enum tc_setup_type type, void *type_data)
+{
+   struct nfp_repr *repr = netdev_priv(netdev);
+   struct nfp_port *port;
+
+   port = nfp_port_from_netdev(netdev);
+   if (!port || port->type != NFP_PORT_PF_PORT)
+   return -EOPNOTSUPP;
+
+   switch (type) {
+   case TC_SETUP_QDISC_RED:
+   return nfp_abm_setup_tc_red(netdev, repr->app_priv, type_data);
+   default:
+   return -EOPNOTSUPP;
+   }
+}
+
 static struct net_device *nfp_abm_repr_get(struct nfp_app *app, u32 port_id)
 {
enum nfp_repr_type rtype;
@@ -403,6 +483,8 @@ const struct nfp_app_type app_abm = {
.vnic_alloc = nfp_abm_vnic_alloc,
.vnic_free  = nfp_abm_vnic_free,
 
+   .setup_tc   = nfp_abm_setup_tc,
+
.eswitch_mode_get   = nfp_abm_eswitch_mode_get,
.eswitch_mode_set   = nfp_abm_eswitch_mode_set,
 
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h 
b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 1ac651cdc140..979f98fb808b 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -58,18 +58,28 @@ struct nfp_abm {
const struct nfp_rtsym *q_lvls;
 };
 
+/**
+ * struct nfp_red_qdisc - representation of single RED Qdisc
+ * @handle:handle of currently offloaded RED Qdisc
+ */
+struct nfp_red_qdisc {
+   u32 handle;
+};
+
 /**
  * struct nfp_abm_link - port tuple of a ABM NIC
  * @abm:   back pointer to nfp_abm
  * @vnic:  data vNIC
  * @id:id of the data vNIC
  * @queue_base:id of base to host queue within PCIe (not QC idx)
+ * @qdiscs:array of qdiscs
  */
 struct nfp_abm_link {
struct nfp_abm *abm;
struct nfp_net *vnic;
unsigned int id;
unsigned int queue_base;
+   struct nfp_red_qdisc qdiscs[1];
 };
 
 void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink);
-- 
2.17.0



[PATCH net-next 03/14] nfp: abm: enable advanced queuing on demand

2018-05-25 Thread Jakub Kicinski
ABM NIC FW has a cut-through mode where the PCIe queuing
is bypassed, thus working like our standard NIC FWs.  Use this
mode by default and only enable queuing in switchdev mode where
users can configure it.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Dirk van der Merwe 
---
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c | 13 +
 drivers/net/ethernet/netronome/nfp/abm/main.c | 11 +++
 drivers/net/ethernet/netronome/nfp/abm/main.h |  2 ++
 drivers/net/ethernet/netronome/nfp/nfp_abi.h  | 14 ++
 4 files changed, 40 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c 
b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
index e40f6f06417b..676d3afc9bdd 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -36,10 +36,23 @@
 
 #include "../nfpcore/nfp_cpp.h"
 #include "../nfp_app.h"
+#include "../nfp_abi.h"
 #include "../nfp_main.h"
 #include "../nfp_net.h"
 #include "main.h"
 
+int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm)
+{
+   return nfp_mbox_cmd(abm->app->pf, NFP_MBOX_PCIE_ABM_ENABLE,
+   NULL, 0, NULL, 0);
+}
+
+int nfp_abm_ctrl_qm_disable(struct nfp_abm *abm)
+{
+   return nfp_mbox_cmd(abm->app->pf, NFP_MBOX_PCIE_ABM_DISABLE,
+   NULL, 0, NULL, 0);
+}
+
 void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink)
 {
alink->queue_base = nn_readl(alink->vnic, NFP_NET_CFG_START_RXQ);
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c 
b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 5a12bb20bced..28a18ac62040 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -182,6 +182,7 @@ static enum devlink_eswitch_mode 
nfp_abm_eswitch_mode_get(struct nfp_app *app)
 static int nfp_abm_eswitch_set_legacy(struct nfp_abm *abm)
 {
nfp_abm_kill_reprs_all(abm);
+   nfp_abm_ctrl_qm_disable(abm);
 
abm->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY;
return 0;
@@ -200,6 +201,10 @@ static int nfp_abm_eswitch_set_switchdev(struct nfp_abm 
*abm)
struct nfp_net *nn;
int err;
 
+   err = nfp_abm_ctrl_qm_enable(abm);
+   if (err)
+   return err;
+
list_for_each_entry(nn, >vnics, vnic_list) {
struct nfp_abm_link *alink = nn->app_priv;
 
@@ -217,6 +222,7 @@ static int nfp_abm_eswitch_set_switchdev(struct nfp_abm 
*abm)
 
 err_kill_all_reprs:
nfp_abm_kill_reprs_all(abm);
+   nfp_abm_ctrl_qm_disable(abm);
return err;
 }
 
@@ -350,6 +356,11 @@ static int nfp_abm_init(struct nfp_app *app)
if (err)
goto err_free_abm;
 
+   /* We start in legacy mode, make sure advanced queuing is disabled */
+   err = nfp_abm_ctrl_qm_disable(abm);
+   if (err)
+   goto err_free_abm;
+
err = -ENOMEM;
reprs = nfp_reprs_alloc(pf->max_data_vnics);
if (!reprs)
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h 
b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 5938b69b8a84..7d129b205535 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -72,4 +72,6 @@ struct nfp_abm_link {
 
 void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink);
 int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm);
+int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm);
+int nfp_abm_ctrl_qm_disable(struct nfp_abm *abm);
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_abi.h 
b/drivers/net/ethernet/netronome/nfp/nfp_abi.h
index 7ffa6e6a9d1c..8b56c27931bf 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_abi.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_abi.h
@@ -59,12 +59,26 @@
  * @NFP_MBOX_POOL_SET: set shared buffer pool info/config
  * Input  - struct nfp_shared_buf_pool_info_set
  * Output - None
+ *
+ * @NFP_MBOX_PCIE_ABM_ENABLE:  enable PCIe-side advanced buffer management
+ * Enable advanced buffer management of the PCIe block.  If ABM is disabled
+ * PCIe block maintains a very short queue of buffers and does tail drop.
+ * ABM allows more advanced buffering and priority control.
+ * Input  - None
+ * Output - None
+ *
+ * @NFP_MBOX_PCIE_ABM_DISABLE: disable PCIe-side advanced buffer management
+ * Input  - None
+ * Output - None
  */
 enum nfp_mbox_cmd {
NFP_MBOX_NO_CMD = 0x00,
 
NFP_MBOX_POOL_GET   = 0x01,
NFP_MBOX_POOL_SET   = 0x02,
+
+   NFP_MBOX_PCIE_ABM_ENABLE= 0x03,
+   NFP_MBOX_PCIE_ABM_DISABLE   = 0x04,
 };
 
 #define NFP_SHARED_BUF_COUNT_SYM_NAME  "_abi_nfd_pf%u_sb_cnt"
-- 
2.17.0



Re: [PATCH] net: netsec: reduce DMA mask to 40 bits

2018-05-25 Thread Jassi Brar
On 26 May 2018 at 08:56, Jassi Brar  wrote:
> On 26 May 2018 at 01:07, Robin Murphy  wrote:
>> On Sat, 26 May 2018 00:33:05 +0530
>> Jassi Brar  wrote:
>>
>>> On 25 May 2018 at 18:20, Ard Biesheuvel 
>>> wrote:
>>> > The netsec network controller IP can drive 64 address bits for DMA,
>>> > and the DMA mask is set accordingly in the driver. However, the
>>> > SynQuacer SoC, which is the only silicon incorporating this IP at
>>> > the moment, integrates this IP in a manner that leaves address bits
>>> > [63:40] unconnected.
>>> >
>>> > Up until now, this has not resulted in any problems, given that the
>>> > DDR controller doesn't decode those bits to begin with. However,
>>> > recent firmware updates for platforms incorporating this SoC allow
>>> > the IOMMU to be enabled, which does decode address bits [47:40],
>>> > and allocates top down from the IOVA space, producing DMA addresses
>>> > that have bits set that have been left unconnected.
>>> >
>>> > Both the DT and ACPI (IORT) descriptions of the platform take this
>>> > into account, and only describe a DMA address space of 40 bits
>>> > (using either dma-ranges DT properties, or DMA address limits in
>>> > IORT named component nodes). However, even though our IOMMU and bus
>>> > layers may take such limitations into account by setting a narrower
>>> > DMA mask when creating the platform device, the netsec probe()
>>> > entrypoint follows the common practice of setting the DMA mask
>>> > unconditionally, according to the capabilities of the IP block itself
>>> > rather than to its integration into the chip.
>>> >
>>> > It is currently unclear what the correct fix is here. We could hack
>>> > around it by only setting the DMA mask if it deviates from its
>>> > default value of DMA_BIT_MASK(32). However, this makes it
>>> > impossible for the bus layer to use DMA_BIT_MASK(32) as the bus
>>> > limit, and so it appears that a more comprehensive approach is
>>> > required to take DMA limits imposed by the SoC as a whole into
>>> > account.
>>> >
>>> > In the mean time, let's limit the DMA mask to 40 bits. Given that
>>> > there is currently only one SoC that incorporates this IP, this is
>>> > a reasonable approach that can be backported to -stable and buys us
>>> > some time to come up with a proper fix going forward.
>>> >
>>> I am sure you already thought about it, but why not let the platform
>>> specify the bit mask for the driver (via some "bus-width" property),
>>> to override the default 64 bit mask?
>>
>> Because lack of a property to describe the integration is not the
>> problem. There are already at least two ways: the general DT/IORT
>> properties for describing DMA addressing - which it would be a bit
>> ungainly for a driver to parse for this reason, but not impossible -
> 
>
>
>> and inferring it from a SoC-specific compatible - which is more
>> appropriate, and what we happen to be able to do here.
>>
> Sorry, I am not sure I follow. This patch changes from 64-bits default
> to 40-bits capability without checking for the parent SoC. If the next
> generation implements the full 64-bit or just 32-bit bus, we'll be
> back in the pit again. No?
>
Probably you meant we'll change the ethernet compatible string for
differently capable SoC. OK, but here it is more of an integration issue
than a controller version.

Which makes me realise the extant compatible property for netsec is
not so correct (it embeds the platform name). So I am ok either way.

Thanks.


Re: [PATCH] net: netsec: reduce DMA mask to 40 bits

2018-05-25 Thread Jassi Brar
On 26 May 2018 at 01:07, Robin Murphy  wrote:
> On Sat, 26 May 2018 00:33:05 +0530
> Jassi Brar  wrote:
>
>> On 25 May 2018 at 18:20, Ard Biesheuvel 
>> wrote:
>> > The netsec network controller IP can drive 64 address bits for DMA,
>> > and the DMA mask is set accordingly in the driver. However, the
>> > SynQuacer SoC, which is the only silicon incorporating this IP at
>> > the moment, integrates this IP in a manner that leaves address bits
>> > [63:40] unconnected.
>> >
>> > Up until now, this has not resulted in any problems, given that the
>> > DDR controller doesn't decode those bits to begin with. However,
>> > recent firmware updates for platforms incorporating this SoC allow
>> > the IOMMU to be enabled, which does decode address bits [47:40],
>> > and allocates top down from the IOVA space, producing DMA addresses
>> > that have bits set that have been left unconnected.
>> >
>> > Both the DT and ACPI (IORT) descriptions of the platform take this
>> > into account, and only describe a DMA address space of 40 bits
>> > (using either dma-ranges DT properties, or DMA address limits in
>> > IORT named component nodes). However, even though our IOMMU and bus
>> > layers may take such limitations into account by setting a narrower
>> > DMA mask when creating the platform device, the netsec probe()
>> > entrypoint follows the common practice of setting the DMA mask
>> > unconditionally, according to the capabilities of the IP block itself
>> > rather than to its integration into the chip.
>> >
>> > It is currently unclear what the correct fix is here. We could hack
>> > around it by only setting the DMA mask if it deviates from its
>> > default value of DMA_BIT_MASK(32). However, this makes it
>> > impossible for the bus layer to use DMA_BIT_MASK(32) as the bus
>> > limit, and so it appears that a more comprehensive approach is
>> > required to take DMA limits imposed by the SoC as a whole into
>> > account.
>> >
>> > In the mean time, let's limit the DMA mask to 40 bits. Given that
>> > there is currently only one SoC that incorporates this IP, this is
>> > a reasonable approach that can be backported to -stable and buys us
>> > some time to come up with a proper fix going forward.
>> >
>> I am sure you already thought about it, but why not let the platform
>> specify the bit mask for the driver (via some "bus-width" property),
>> to override the default 64 bit mask?
>
> Because lack of a property to describe the integration is not the
> problem. There are already at least two ways: the general DT/IORT
> properties for describing DMA addressing - which it would be a bit
> ungainly for a driver to parse for this reason, but not impossible -



> and inferring it from a SoC-specific compatible - which is more
> appropriate, and what we happen to be able to do here.
>
Sorry, I am not sure I follow. This patch changes from 64-bits default
to 40-bits capability without checking for the parent SoC. If the next
generation implements the full 64-bit or just 32-bit bus, we'll be
back in the pit again. No?

Thanks.


[PATCH] PCI: reset driver SR-IOV state after remove

2018-05-25 Thread Jakub Kicinski
Bjorn points out that currently core and most of the drivers don't
clean up dev->sriov->driver_max_VFs settings on .remove().  This
means that if a different driver is bound afterwards it will
inherit the old setting:

  - load PF driver 1
  - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
  - unload PF driver 1
  - load PF driver 2

Reset driver_max_VFs back to total_VFs after device remove.

Signed-off-by: Jakub Kicinski 
---
I gave into the temptation and also added a warning about SR-IOV
being on after remove :)  Please let me know if this is anywhere
close to what you had in mind!

 drivers/pci/iov.c| 16 
 drivers/pci/pci-driver.c |  1 +
 drivers/pci/pci.h|  4 
 3 files changed, 21 insertions(+)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index db86fd26f8e1..5d0f560a1e28 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -574,6 +574,22 @@ void pci_iov_release(struct pci_dev *dev)
sriov_release(dev);
 }
 
+/**
+ * pci_sriov_drv_cleanup - clean up SR-IOV state after PF driver is detached
+ * @dev: the PCI device
+ */
+void pci_sriov_drv_cleanup(struct pci_dev *dev)
+{
+   struct pci_sriov *iov = dev->sriov;
+
+   if (!dev->is_physfn)
+   return;
+   iov->driver_max_VFs = iov->total_VFs;
+   if (iov->num_VFs)
+   dev_warn(>dev,
+"driver left SR-IOV enabled after remove\n");
+}
+
 /**
  * pci_iov_update_resource - update a VF BAR
  * @dev: the PCI device
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index b9a131137e64..932a1acf7b1b 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -443,6 +443,7 @@ static int pci_device_remove(struct device *dev)
}
pcibios_free_irq(pci_dev);
pci_dev->driver = NULL;
+   pci_sriov_drv_cleanup(pci_dev);
}
 
/* Undo the runtime PM settings in local_pci_probe() */
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 023f7cf25bff..5fa6d19762bd 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -311,6 +311,7 @@ static inline void pci_restore_ats_state(struct pci_dev 
*dev)
 #ifdef CONFIG_PCI_IOV
 int pci_iov_init(struct pci_dev *dev);
 void pci_iov_release(struct pci_dev *dev);
+void pci_sriov_drv_cleanup(struct pci_dev *dev);
 void pci_iov_update_resource(struct pci_dev *dev, int resno);
 resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno);
 void pci_restore_iov_state(struct pci_dev *dev);
@@ -323,6 +324,9 @@ static inline int pci_iov_init(struct pci_dev *dev)
 }
 static inline void pci_iov_release(struct pci_dev *dev)
 
+{
+}
+static inline void pci_sriov_drv_cleanup(struct pci_dev *dev)
 {
 }
 static inline void pci_restore_iov_state(struct pci_dev *dev)
-- 
2.17.0



Re: [PATCH net-next 0/8] nfp: offload LAG for tc flower egress

2018-05-25 Thread Jakub Kicinski
On Fri, 25 May 2018 08:48:09 +0200, Jiri Pirko wrote:
> Thu, May 24, 2018 at 04:22:47AM CEST, jakub.kicin...@netronome.com wrote:
> >Hi!
> >
> >This series from John adds bond offload to the nfp driver.  Patch 5
> >exposes the hash type for NETDEV_LAG_TX_TYPE_HASH to make sure nfp
> >hashing matches that of the software LAG.  This may be unnecessarily
> >conservative, let's see what LAG maintainers think :)  
> 
> So you need to restrict offload to only certain hash algo? In mlxsw, we
> just ignore the lag setting and do some hw default hashing. Would not be
> enough? Note that there's a good reason for it, as you see, in team, the
> hashing is done in a BPF function and could be totally arbitrary.
> Your patchset effectively disables team offload for nfp.

My understanding is that the project requirements only called for L3/L4
hash algorithm offload, hence the temptation to err on the side of
caution and not offload all the bond configurations.  John can provide
more details.  Not being able to offload team is unfortunate indeed.


Re: [PATCH net-next] bpfilter: fix a build err

2018-05-25 Thread YueHaibing
On 2018/5/26 0:19, Alexei Starovoitov wrote:
> On Fri, May 25, 2018 at 06:17:57PM +0800, YueHaibing wrote:
>> gcc-7.3.0 report following err:
>>
>>   HOSTCC  net/bpfilter/main.o
>> In file included from net/bpfilter/main.c:9:0:
>> ./include/uapi/linux/bpf.h:12:10: fatal error: linux/bpf_common.h: No such 
>> file or directory
>>  #include 
>>
>> remove it by adding a include path.
>> Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
>>
>> Signed-off-by: YueHaibing 
>> ---
>>  net/bpfilter/Makefile | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
>> index 2af752c..3f3cb87 100644
>> --- a/net/bpfilter/Makefile
>> +++ b/net/bpfilter/Makefile
>> @@ -5,7 +5,7 @@
>>  
>>  hostprogs-y := bpfilter_umh
>>  bpfilter_umh-objs := main.o
>> -HOSTCFLAGS += -I. -Itools/include/
>> +HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi
> 
> Strangely I don't see this error with gcc 7.3
> I've tried this patch and it doesn't hurt,
> but before it gets applied could you please try
> the top two patches from this tree:
> https://git.kernel.org/pub/scm/linux/kernel/git/ast/bpf.git/?h=ipt_bpf
> in your environment?
> These two patches add the actual meat of bpfilter and I'd like
> to make sure the build setup is good for everyone before
> we proceed too far.

after applied these two patches on net-next, the err still here:
 bpfilter: rough bpfilter codegen example hack
 bpfilter: add iptable get/set parsing

  HOSTCC  net/bpfilter/main.o
In file included from net/bpfilter/main.c:13:0:
./include/uapi/linux/bpf.h:12:10: fatal error: linux/bpf_common.h: No such file 
or directory
 #include 
  ^~~~
compilation terminated.
make[2]: *** [net/bpfilter/main.o] Error 1
make[1]: *** [net/bpfilter] Error 2
make: *** [net] Error 2

Also I compile your tree, error is same

my gcc version info as follow:
[root@localhost net-next]# gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/home/yuehb/gcc-7.3.0-tools/libexec/gcc/x86_64-pc-linux-gnu/7.3.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../gcc-7.3.0/configure --enable-checking=release 
--enable-languages=c,c++
--disable-multilib --prefix=/home/yuehb/gcc-7.3.0-tools
Thread model: posix
gcc version 7.3.0 (GCC)

> 
> 
> .
> 



Re: [PATCH] IB: Revert "remove redundant INFINIBAND kconfig dependencies"

2018-05-25 Thread Greg Thelen
On Fri, May 25, 2018 at 2:32 PM Arnd Bergmann  wrote:

> Several subsystems depend on INFINIBAND_ADDR_TRANS, which in turn depends
> on INFINIBAND. However, when with CONFIG_INIFIBAND=m, this leads to a
> link error when another driver using it is built-in. The
> INFINIBAND_ADDR_TRANS dependency is insufficient here as this is
> a 'bool' symbol that does not force anything to be a module in turn.

> fs/cifs/smbdirect.o: In function `smbd_disconnect_rdma_work':
> smbdirect.c:(.text+0x1e4): undefined reference to `rdma_disconnect'
> net/9p/trans_rdma.o: In function `rdma_request':
> trans_rdma.c:(.text+0x7bc): undefined reference to `rdma_disconnect'
> net/9p/trans_rdma.o: In function `rdma_destroy_trans':
> trans_rdma.c:(.text+0x830): undefined reference to `ib_destroy_qp'
> trans_rdma.c:(.text+0x858): undefined reference to `ib_dealloc_pd'

> Fixes: 9533b292a7ac ("IB: remove redundant INFINIBAND kconfig
dependencies")
> Signed-off-by: Arnd Bergmann 

Acked-by: Greg Thelen 

Sorry for the 9533b292a7ac problem.
At this point the in release cycle, I think Arnd's revert is best.

If there is interest, I've put a little thought into an alternative fix:
making INFINIBAND_ADDR_TRANS tristate.  But it's nontrivial.
So I prefer this simple revert for now.

Doug: do you need anything from me on this?

> ---
> The patch that introduced the problem has been queued in the
> rdma-fixes/for-rc tree. Please revert the patch before sending
> the branch to Linus.
> ---
> drivers/infiniband/ulp/srpt/Kconfig | 2 +-
> drivers/nvme/host/Kconfig   | 2 +-
> drivers/nvme/target/Kconfig | 2 +-
> drivers/staging/lustre/lnet/Kconfig | 2 +-
> fs/cifs/Kconfig | 2 +-
> net/9p/Kconfig  | 2 +-
> net/rds/Kconfig | 2 +-
> net/sunrpc/Kconfig  | 2 +-
> 8 files changed, 8 insertions(+), 8 deletions(-)

> diff --git a/drivers/infiniband/ulp/srpt/Kconfig
b/drivers/infiniband/ulp/srpt/Kconfig
> index 25bf6955b6d0..fb8b7182f05e 100644
> --- a/drivers/infiniband/ulp/srpt/Kconfig
> +++ b/drivers/infiniband/ulp/srpt/Kconfig
> @@ -1,6 +1,6 @@
> config INFINIBAND_SRPT
>tristate "InfiniBand SCSI RDMA Protocol target support"
> -   depends on INFINIBAND_ADDR_TRANS && TARGET_CORE
> +   depends on INFINIBAND && INFINIBAND_ADDR_TRANS && TARGET_CORE
>---help---

>  Support for the SCSI RDMA Protocol (SRP) Target driver. The
> diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
> index dbb7464c018c..88a8b5916624 100644
> --- a/drivers/nvme/host/Kconfig
> +++ b/drivers/nvme/host/Kconfig
> @@ -27,7 +27,7 @@ config NVME_FABRICS

> config NVME_RDMA
>tristate "NVM Express over Fabrics RDMA host driver"
> -   depends on INFINIBAND_ADDR_TRANS && BLOCK
> +   depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK
>select NVME_CORE
>select NVME_FABRICS
>select SG_POOL
> diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
> index 7595664ee753..3c7b61ddb0d1 100644
> --- a/drivers/nvme/target/Kconfig
> +++ b/drivers/nvme/target/Kconfig
> @@ -27,7 +27,7 @@ config NVME_TARGET_LOOP

> config NVME_TARGET_RDMA
>tristate "NVMe over Fabrics RDMA target support"
> -   depends on INFINIBAND_ADDR_TRANS
> +   depends on INFINIBAND && INFINIBAND_ADDR_TRANS
>depends on NVME_TARGET
>select SGL_ALLOC
>help
> diff --git a/drivers/staging/lustre/lnet/Kconfig
b/drivers/staging/lustre/lnet/Kconfig
> index f3b1ad4bd3dc..ad049e6f24e4 100644
> --- a/drivers/staging/lustre/lnet/Kconfig
> +++ b/drivers/staging/lustre/lnet/Kconfig
> @@ -34,7 +34,7 @@ config LNET_SELFTEST

> config LNET_XPRT_IB
>tristate "LNET infiniband support"
> -   depends on LNET && PCI && INFINIBAND_ADDR_TRANS
> +   depends on LNET && PCI && INFINIBAND && INFINIBAND_ADDR_TRANS
>default LNET && INFINIBAND
>help
>  This option allows the LNET users to use infiniband as an
> diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
> index d61e2de8d0eb..5f132d59dfc2 100644
> --- a/fs/cifs/Kconfig
> +++ b/fs/cifs/Kconfig
> @@ -197,7 +197,7 @@ config CIFS_SMB311

> config CIFS_SMB_DIRECT
>bool "SMB Direct support (Experimental)"
> -   depends on CIFS=m && INFINIBAND_ADDR_TRANS || CIFS=y &&
INFINIBAND_ADDR_TRANS=y
> +   depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS ||
CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
>help
>  Enables SMB Direct experimental support for SMB 3.0, 3.02 and
3.1.1.
>  SMB Direct allows transferring SMB packets over RDMA. If
unsure,
> diff --git a/net/9p/Kconfig b/net/9p/Kconfig
> index 46c39f7da444..e6014e0e51f7 100644
> --- a/net/9p/Kconfig
> +++ b/net/9p/Kconfig
> @@ -32,7 +32,7 @@ config NET_9P_XEN


> 

[for-next 01/12] net/mlx5: E-Switch, Reorganize and rename fdb flow tables

2018-05-25 Thread Saeed Mahameed
From: Chris Mi 

We have several fdb flow tables for each of the legacy and switchdev
modes. In the switchdev mode, there are fast path and slow path flow
tables. Towards adding more flow tables in upcoming patches, reorganize
and rename the various existing ones to reflect their functionality.

Signed-off-by: Chris Mi 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/eswitch.c | 22 +--
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  5 +++--
 .../mellanox/mlx5/core/eswitch_offloads.c | 22 +--
 3 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 09f0e11c6ffc..6cab1dd66d1b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -200,7 +200,7 @@ __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 
vport, bool rx_rule,
spec->match_criteria_enable = match_header;
flow_act.action =  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
flow_rule =
-   mlx5_add_flow_rules(esw->fdb_table.fdb, spec,
+   mlx5_add_flow_rules(esw->fdb_table.legacy.fdb, spec,
_act, , 1);
if (IS_ERR(flow_rule)) {
esw_warn(esw->dev,
@@ -282,7 +282,7 @@ static int esw_create_legacy_fdb_table(struct mlx5_eswitch 
*esw, int nvports)
esw_warn(dev, "Failed to create FDB Table err %d\n", err);
goto out;
}
-   esw->fdb_table.fdb = fdb;
+   esw->fdb_table.legacy.fdb = fdb;
 
/* Addresses group : Full match unicast/multicast addresses */
MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
@@ -343,9 +343,9 @@ static int esw_create_legacy_fdb_table(struct mlx5_eswitch 
*esw, int nvports)
mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp);
esw->fdb_table.legacy.addr_grp = NULL;
}
-   if (!IS_ERR_OR_NULL(esw->fdb_table.fdb)) {
-   mlx5_destroy_flow_table(esw->fdb_table.fdb);
-   esw->fdb_table.fdb = NULL;
+   if (!IS_ERR_OR_NULL(esw->fdb_table.legacy.fdb)) {
+   mlx5_destroy_flow_table(esw->fdb_table.legacy.fdb);
+   esw->fdb_table.legacy.fdb = NULL;
}
}
 
@@ -355,15 +355,15 @@ static int esw_create_legacy_fdb_table(struct 
mlx5_eswitch *esw, int nvports)
 
 static void esw_destroy_legacy_fdb_table(struct mlx5_eswitch *esw)
 {
-   if (!esw->fdb_table.fdb)
+   if (!esw->fdb_table.legacy.fdb)
return;
 
esw_debug(esw->dev, "Destroy FDB Table\n");
mlx5_destroy_flow_group(esw->fdb_table.legacy.promisc_grp);
mlx5_destroy_flow_group(esw->fdb_table.legacy.allmulti_grp);
mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp);
-   mlx5_destroy_flow_table(esw->fdb_table.fdb);
-   esw->fdb_table.fdb = NULL;
+   mlx5_destroy_flow_table(esw->fdb_table.legacy.fdb);
+   esw->fdb_table.legacy.fdb = NULL;
esw->fdb_table.legacy.addr_grp = NULL;
esw->fdb_table.legacy.allmulti_grp = NULL;
esw->fdb_table.legacy.promisc_grp = NULL;
@@ -396,7 +396,7 @@ static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct 
vport_addr *vaddr)
 
 fdb_add:
/* SRIOV is enabled: Forward UC MAC to vport */
-   if (esw->fdb_table.fdb && esw->mode == SRIOV_LEGACY)
+   if (esw->fdb_table.legacy.fdb && esw->mode == SRIOV_LEGACY)
vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport);
 
esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM fr(%p)\n",
@@ -486,7 +486,7 @@ static int esw_add_mc_addr(struct mlx5_eswitch *esw, struct 
vport_addr *vaddr)
u8 *mac = vaddr->node.addr;
u32 vport = vaddr->vport;
 
-   if (!esw->fdb_table.fdb)
+   if (!esw->fdb_table.legacy.fdb)
return 0;
 
esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr);
@@ -526,7 +526,7 @@ static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct 
vport_addr *vaddr)
u8 *mac = vaddr->node.addr;
u32 vport = vaddr->vport;
 
-   if (!esw->fdb_table.fdb)
+   if (!esw->fdb_table.legacy.fdb)
return 0;
 
esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index f47a14e31b7d..d1a3f7fcca1c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -117,16 +117,17 @@ struct mlx5_vport {
 };
 
 struct mlx5_eswitch_fdb {
-   void *fdb;
union {
struct legacy_fdb {
+  

[pull request][for-next 00/12] Mellanox, mlx5e updates 2018-05-25

2018-05-25 Thread Saeed Mahameed
Hi dave,

This is a mlx5e only pull request, for more information please see tag
log below.

Please pull and let me know if there's any problem.

Thanks,
Saeed.



The following changes since commit e52cde71709348c0d67bf0f213b438fa4d6cf9a9:

  net: dsa: dsa_loop: Make dynamic debugging helpful (2018-05-25 16:46:29 -0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5e-updates-2018-05-25

for you to fetch changes up to 05909babce5328f468f7ac3a1033431c895f97a5:

  net/mlx5e: Avoid reset netdev stats on configuration changes (2018-05-25 
16:14:28 -0700)


mlx5e-updates-2018-05-25

This series includes updates for mlx5e netdev driver.

1) Allow flow based VF vport mirroring under the sriov switchdev scheme;
added support for offloading the TC mirred mirror sub-action, from
Chris Mi.

=
From: Or Gerlitz 

The user will typically set the actions order such that the mirror
port (mirror VF) sees packets as the original port (VF under
mirroring) sent them or as it will receive them. In the general case,
it means that packets are potentially sent to the mirror port before
or after some actions were applied on them.

To properly do that, we follow on the exact action order as set for
the flow and make sure this will also be the case when we program the
HW offload.

If all the actions should apply before forwarding to the mirror and dest port,
mirroring is just multicasting to the two vports. Otherwise, we split
the TC flow to two HW rules, where the 1st applies only the actions
needed up to the mirror (if there are such) and the 2nd the rest of
the actions plus the forwarding to the dest vport.
=

2) Move to order-0 only allocations (using fragmented work queues) for all
work queues used by the driver, RX and TX descriptor rings
(RQs, SQs and Completion Queues (CQs)), from Tariq Toukan.

3) Avoid resetting netdevice statistics on netdevice
state changes, from Eran Ben Elisha.


Chris Mi (5):
  net/mlx5: E-Switch, Reorganize and rename fdb flow tables
  net/mlx5: Add cap bits for flow table destination in FDB table
  net/mlx5: E-switch, Create a second level FDB flow table
  net/mlx5e: Parse mirroring action for offloaded TC eswitch flows
  net/mlx5e: Split offloaded eswitch TC rules for port mirroring

Eran Ben Elisha (1):
  net/mlx5e: Avoid reset netdev stats on configuration changes

Saeed Mahameed (1):
  net/mlx5e: Move phy link down events counter out of SW stats

Shalom Lagziel (1):
  net/mlx5e: Introducing new statistics rwlock

Tariq Toukan (4):
  net/mlx5e: Use WQ API functions instead of direct fields access
  net/mlx5e: TX, Use actual WQE size for SQ edge fill
  net/mlx5i: Use compilation flag in IPOIB header
  net/mlx5: Use order-0 allocations for all WQ types

 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  30 ++-
 .../mellanox/mlx5/core/en_accel/tls_rxtx.c |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  88 ---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  20 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c| 126 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 101 
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |   5 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c|  83 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c| 262 +
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  22 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  23 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 133 ---
 .../net/ethernet/mellanox/mlx5/core/fpga/conn.c|  14 +-
 .../net/ethernet/mellanox/mlx5/core/fpga/conn.h|   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |   2 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h  |  26 ++
 drivers/net/ethernet/mellanox/mlx5/core/wq.c   |  94 +---
 drivers/net/ethernet/mellanox/mlx5/core/wq.h   |  50 ++--
 include/linux/mlx5/driver.h|  16 +-
 include/linux/mlx5/mlx5_ifc.h  |   4 +-
 21 files changed, 724 insertions(+), 387 deletions(-)


[for-next 04/12] net/mlx5e: Parse mirroring action for offloaded TC eswitch flows

2018-05-25 Thread Saeed Mahameed
From: Chris Mi 

Currently, we only support the mirred redirect TC sub-action. In order
to support flow based vport mirroring, add support to parse the mirred
mirror sub-action.

For mirroring, user-space will typically set the action order such that
the mirror port (mirror VF) sees packets as the original port (VF under
mirroring) sent them or as it will receive them.

In the general case, it means that packets are potentially sent to the
mirror port before or after some actions were applied on them. To
properly do that, we should follow on the exact action order as set for
the flow and make sure this will also be the case when we program the HW
offload.

We introduce a counter for the output ports (attr->out_count), which we
increase when parsing each mirred redirect/mirror sub-action and when
dealing with encap.

We introduce a counter (attr->mirror_count) telling us if split is
needed. If no split is needed and mirroring is just multicasting to
vport, the mirror count is zero, all the actions of the TC flow should
apply on that single HW flow.

If split is needed, the mirror count tells where to do the split, all
non-mirred tc actions should apply only after the split.

The mirror count is set while parsing the following actions encap/decap,
header re-write, vlan push/pop.

Signed-off-by: Chris Mi 
Reviewed-by: Paul Blakey 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 26 +
 .../net/ethernet/mellanox/mlx5/core/eswitch.h | 10 +--
 .../mellanox/mlx5/core/eswitch_offloads.c | 28 ++-
 3 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index a9c96fe8e4fe..302c5500f9ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -844,8 +844,8 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
}
out_priv = netdev_priv(encap_dev);
rpriv = out_priv->ppriv;
-   attr->out_rep = rpriv->rep;
-   attr->out_mdev = out_priv->mdev;
+   attr->out_rep[attr->out_count] = rpriv->rep;
+   attr->out_mdev[attr->out_count++] = out_priv->mdev;
}
 
err = mlx5_eswitch_add_vlan_action(esw, attr);
@@ -2537,6 +2537,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, 
struct tcf_exts *exts,
return err;
 
action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+   attr->mirror_count = attr->out_count;
continue;
}
 
@@ -2548,12 +2549,18 @@ static int parse_tc_fdb_actions(struct mlx5e_priv 
*priv, struct tcf_exts *exts,
return -EOPNOTSUPP;
}
 
-   if (is_tcf_mirred_egress_redirect(a)) {
-   struct net_device *out_dev;
+   if (is_tcf_mirred_egress_redirect(a) || 
is_tcf_mirred_egress_mirror(a)) {
struct mlx5e_priv *out_priv;
+   struct net_device *out_dev;
 
out_dev = tcf_mirred_dev(a);
 
+   if (attr->out_count >= MLX5_MAX_FLOW_FWD_VPORTS) {
+   pr_err("can't support more than %d output 
ports, can't offload forwarding\n",
+  attr->out_count);
+   return -EOPNOTSUPP;
+   }
+
if (switchdev_port_same_parent_id(priv->netdev,
  out_dev) ||
is_merged_eswitch_dev(priv, out_dev)) {
@@ -2561,8 +2568,8 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, 
struct tcf_exts *exts,
  MLX5_FLOW_CONTEXT_ACTION_COUNT;
out_priv = netdev_priv(out_dev);
rpriv = out_priv->ppriv;
-   attr->out_rep = rpriv->rep;
-   attr->out_mdev = out_priv->mdev;
+   attr->out_rep[attr->out_count] = rpriv->rep;
+   attr->out_mdev[attr->out_count++] = 
out_priv->mdev;
} else if (encap) {
parse_attr->mirred_ifindex = out_dev->ifindex;
parse_attr->tun_info = *info;
@@ -2585,6 +2592,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, 
struct tcf_exts *exts,
encap = true;
else
return -EOPNOTSUPP;
+   attr->mirror_count = attr->out_count;
continue;
 

[for-next 05/12] net/mlx5e: Split offloaded eswitch TC rules for port mirroring

2018-05-25 Thread Saeed Mahameed
From: Chris Mi 

If a TC rule needs to be split for mirroring, create two HW rules,
in the first level and the second level flow tables accordingly.

In the first level flow table, forward the packet to the mirror
port and forward the packet to the second level flow table for
further processing, eg. encap, vlan push or header re-write.

Currently the matching is repeated in both stages.

While here, simplify the setup of the vhca id valid indicator also
in the existing code.

Signed-off-by: Chris Mi 
Reviewed-by: Paul Blakey 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 57 
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  4 ++
 .../mellanox/mlx5/core/eswitch_offloads.c | 68 +--
 3 files changed, 108 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 302c5500f9ad..9372d914abe5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -75,12 +75,14 @@ enum {
MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(MLX5E_TC_FLOW_BASE + 4),
 };
 
+#define MLX5E_TC_MAX_SPLITS 1
+
 struct mlx5e_tc_flow {
struct rhash_head   node;
struct mlx5e_priv   *priv;
u64 cookie;
u8  flags;
-   struct mlx5_flow_handle *rule;
+   struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1];
struct list_headencap;   /* flows sharing the same encap ID */
struct list_headmod_hdr; /* flows sharing the same mod hdr ID */
struct list_headhairpin; /* flows sharing the same hairpin */
@@ -794,8 +796,8 @@ static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv,
struct mlx5_nic_flow_attr *attr = flow->nic_attr;
struct mlx5_fc *counter = NULL;
 
-   counter = mlx5_flow_rule_counter(flow->rule);
-   mlx5_del_flow_rules(flow->rule);
+   counter = mlx5_flow_rule_counter(flow->rule[0]);
+   mlx5_del_flow_rules(flow->rule[0]);
mlx5_fc_destroy(priv->mdev, counter);
 
if (!mlx5e_tc_num_filters(priv) && priv->fs.tc.t) {
@@ -870,9 +872,18 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
rule = mlx5_eswitch_add_offloaded_rule(esw, _attr->spec, 
attr);
if (IS_ERR(rule))
goto err_add_rule;
+
+   if (attr->mirror_count) {
+   flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, 
_attr->spec, attr);
+   if (IS_ERR(flow->rule[1]))
+   goto err_fwd_rule;
+   }
}
return rule;
 
+err_fwd_rule:
+   mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+   rule = flow->rule[1];
 err_add_rule:
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
mlx5e_detach_mod_hdr(priv, flow);
@@ -893,7 +904,9 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 
if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
-   mlx5_eswitch_del_offloaded_rule(esw, flow->rule, attr);
+   if (attr->mirror_count)
+   mlx5_eswitch_del_offloaded_rule(esw, flow->rule[1], 
attr);
+   mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
}
 
mlx5_eswitch_del_vlan_action(esw, attr);
@@ -929,13 +942,25 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
list_for_each_entry(flow, >flows, encap) {
esw_attr = flow->esw_attr;
esw_attr->encap_id = e->encap_id;
-   flow->rule = mlx5_eswitch_add_offloaded_rule(esw, 
_attr->parse_attr->spec, esw_attr);
-   if (IS_ERR(flow->rule)) {
-   err = PTR_ERR(flow->rule);
+   flow->rule[0] = mlx5_eswitch_add_offloaded_rule(esw, 
_attr->parse_attr->spec, esw_attr);
+   if (IS_ERR(flow->rule[0])) {
+   err = PTR_ERR(flow->rule[0]);
mlx5_core_warn(priv->mdev, "Failed to update cached 
encapsulation flow, %d\n",
   err);
continue;
}
+
+   if (esw_attr->mirror_count) {
+   flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, 
_attr->parse_attr->spec, esw_attr);
+   if (IS_ERR(flow->rule[1])) {
+   mlx5_eswitch_del_offloaded_rule(esw, 
flow->rule[0], esw_attr);
+   err = PTR_ERR(flow->rule[1]);
+   mlx5_core_warn(priv->mdev, "Failed to update 
cached mirror flow, %d\n",
+  err);
+   

[for-next 02/12] net/mlx5: Add cap bits for flow table destination in FDB table

2018-05-25 Thread Saeed Mahameed
From: Chris Mi 

If set, the FDB table supports the forward action with a
destination list that includes a flow table.

Signed-off-by: Chris Mi 
Reviewed-by: Paul Blakey 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index edbddeaacc88..05b480fae27d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -524,7 +524,9 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 };
 
 struct mlx5_ifc_flow_table_eswitch_cap_bits {
-   u8 reserved_at_0[0x200];
+   u8  reserved_at_0[0x1c];
+   u8  fdb_multi_path_to_table[0x1];
+   u8  reserved_at_1d[0x1e3];
 
struct mlx5_ifc_flow_table_prop_layout_bits 
flow_table_properties_nic_esw_fdb;
 
-- 
2.17.0



[for-next 11/12] net/mlx5e: Introducing new statistics rwlock

2018-05-25 Thread Saeed Mahameed
From: Shalom Lagziel 

Introduce a new read/write lock that will protect statistics gathering from
netdev channels configuration changes.
e.g. when channels are being replaced (increase/decrease number of rings)
prevent statistics gathering (ndo_get_stats64) from reading the statistics
of inactive channels (channels that are being closed).

Plus update channels software statistics on the fly when calling
ndo_get_stats64, and remove it from stats periodic work.

Fixes: 9218b44dcc05 ("net/mlx5e: Statistics handling refactoring")
Signed-off-by: Shalom Lagziel 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h |  2 ++
 .../net/ethernet/mellanox/mlx5/core/en_main.c|  8 
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 16 +---
 .../net/ethernet/mellanox/mlx5/core/en_stats.c   |  8 ++--
 .../net/ethernet/mellanox/mlx5/core/en_stats.h   |  2 ++
 5 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9396db54973f..c3c79f2835d2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -771,6 +771,8 @@ struct mlx5e_priv {
struct mutex   state_lock; /* Protects Interface state */
struct mlx5e_rqdrop_rq;
 
+   rwlock_t   stats_lock; /* Protects channels SW stats 
updates */
+   bool   channels_active;
struct mlx5e_channels  channels;
u32tisn[MLX5E_MAX_NUM_TC];
struct mlx5e_rqt   indir_rqt;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 0c167e5fc346..0e9c64580abb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2658,6 +2658,9 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
 
mlx5e_build_channels_tx_maps(priv);
mlx5e_activate_channels(>channels);
+   write_lock(>stats_lock);
+   priv->channels_active = true;
+   write_unlock(>stats_lock);
netif_tx_start_all_queues(priv->netdev);
 
if (MLX5_VPORT_MANAGER(priv->mdev))
@@ -2679,6 +2682,9 @@ void mlx5e_deactivate_priv_channels(struct mlx5e_priv 
*priv)
 */
netif_tx_stop_all_queues(priv->netdev);
netif_tx_disable(priv->netdev);
+   write_lock(>stats_lock);
+   priv->channels_active = false;
+   write_unlock(>stats_lock);
mlx5e_deactivate_channels(>channels);
 }
 
@@ -3223,6 +3229,7 @@ mlx5e_get_stats(struct net_device *dev, struct 
rtnl_link_stats64 *stats)
stats->tx_packets = PPORT_802_3_GET(pstats, 
a_frames_transmitted_ok);
stats->tx_bytes   = PPORT_802_3_GET(pstats, 
a_octets_transmitted_ok);
} else {
+   mlx5e_grp_sw_update_stats(priv);
stats->rx_packets = sstats->rx_packets;
stats->rx_bytes   = sstats->rx_bytes;
stats->tx_packets = sstats->tx_packets;
@@ -4248,6 +4255,7 @@ static void mlx5e_build_nic_netdev_priv(struct 
mlx5_core_dev *mdev,
   profile->max_nch(mdev), netdev->mtu);
 
mutex_init(>state_lock);
+   rwlock_init(>stats_lock);
 
INIT_WORK(>update_carrier_work, mlx5e_update_carrier_work);
INIT_WORK(>set_rx_mode_work, mlx5e_set_rx_mode_work);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index c3034f58aa33..1a3f9e091385 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -130,6 +130,10 @@ static void mlx5e_rep_update_sw_counters(struct mlx5e_priv 
*priv)
struct mlx5e_sq_stats *sq_stats;
int i, j;
 
+   read_lock(>stats_lock);
+   if (!priv->channels_active)
+   goto out;
+
memset(s, 0, sizeof(*s));
for (i = 0; i < priv->channels.num; i++) {
struct mlx5e_channel *c = priv->channels.c[i];
@@ -146,12 +150,8 @@ static void mlx5e_rep_update_sw_counters(struct mlx5e_priv 
*priv)
s->tx_bytes += sq_stats->bytes;
}
}
-}
-
-static void mlx5e_rep_update_stats(struct mlx5e_priv *priv)
-{
-   mlx5e_rep_update_sw_counters(priv);
-   mlx5e_rep_update_hw_counters(priv);
+out:
+   read_unlock(>stats_lock);
 }
 
 static void mlx5e_rep_get_ethtool_stats(struct net_device *dev,
@@ -871,6 +871,8 @@ mlx5e_get_sw_stats64(const struct net_device *dev,
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5e_sw_stats *sstats = >stats.sw;
 
+   mlx5e_rep_update_sw_counters(priv);
+
stats->rx_packets = sstats->rx_packets;
stats->rx_bytes   = sstats->rx_bytes;

[for-next 07/12] net/mlx5e: TX, Use actual WQE size for SQ edge fill

2018-05-25 Thread Saeed Mahameed
From: Tariq Toukan 

We fill SQ edge with NOPs to avoid WQEs wrap.
Here, instead of doing that in advance for the maximum possible
WQE size, we do it on-demand using the actual WQE size.
We re-order some parts in mlx5e_sq_xmit to finish the calculation
of WQE size (ds_cnt) before doing any writes to the WQE buffer.

When the SQ work queue is fragmented (introduced in a downstream patch),
dealing with WQE wraps becomes more frequent. This change would drastically
reduce the overhead in this case.

Performance tests:
ConnectX-5 100Gbps, CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
Packet rate of 64B packets, single transmit ring, size 8K.

Before: 14.9 Mpps
After:  15.8 Mpps

Improvement of 6%.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |   3 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c |   4 -
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   |  27 ++-
 .../net/ethernet/mellanox/mlx5/core/en_tx.c   | 213 +++---
 .../ethernet/mellanox/mlx5/core/ipoib/ipoib.h |  23 ++
 5 files changed, 178 insertions(+), 92 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 061c4e90692e..3c0f0a0343fd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -183,6 +183,7 @@ static inline int mlx5e_get_max_num_channels(struct 
mlx5_core_dev *mdev)
 struct mlx5e_tx_wqe {
struct mlx5_wqe_ctrl_seg ctrl;
struct mlx5_wqe_eth_seg  eth;
+   struct mlx5_wqe_data_seg data[0];
 };
 
 struct mlx5e_rx_wqe {
@@ -374,7 +375,6 @@ struct mlx5e_txqsq {
struct netdev_queue   *txq;
u32sqn;
u8 min_inline_mode;
-   u16edge;
struct device *pdev;
__be32 mkey_be;
unsigned long  state;
@@ -439,7 +439,6 @@ struct mlx5e_icosq {
struct mlx5_wq_cyc wq;
void __iomem  *uar_map;
u32sqn;
-   u16edge;
unsigned long  state;
 
/* control path */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 41f57afc5140..a8b1e43384ca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -959,8 +959,6 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
if (err)
goto err_sq_wq_destroy;
 
-   sq->edge = mlx5_wq_cyc_get_size(wq) - MLX5E_ICOSQ_MAX_WQEBBS;
-
return 0;
 
 err_sq_wq_destroy:
@@ -1039,8 +1037,6 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
INIT_WORK(>dim.work, mlx5e_tx_dim_work);
sq->dim.mode = params->tx_cq_moderation.cq_period_mode;
 
-   sq->edge = mlx5_wq_cyc_get_size(wq) - MLX5_SEND_WQE_MAX_WQEBBS;
-
return 0;
 
 err_sq_wq_destroy:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 7fd3ec877ba4..f4d2c8886492 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -383,6 +383,22 @@ static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq 
*sq)
return sq->pc >> MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
 }
 
+static inline void mlx5e_fill_icosq_edge(struct mlx5e_icosq *sq,
+struct mlx5_wq_cyc *wq,
+u16 pi)
+{
+   struct mlx5e_sq_wqe_info *edge_wi, *wi = >db.ico_wqe[pi];
+   u8 nnops = mlx5_wq_cyc_get_size(wq) - pi;
+
+   edge_wi = wi + nnops;
+
+   /* fill sq edge with nops to avoid wqe wrapping two pages */
+   for (; wi < edge_wi; wi++) {
+   wi->opcode = MLX5_OPCODE_NOP;
+   mlx5e_post_nop(wq, sq->sqn, >pc);
+   }
+}
+
 static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 {
struct mlx5e_mpw_info *wi = >mpwqe.info[ix];
@@ -391,14 +407,15 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 
ix)
struct mlx5_wq_cyc *wq = >wq;
struct mlx5e_umr_wqe *umr_wqe;
u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);
-   int err;
u16 pi;
+   int err;
int i;
 
-   /* fill sq edge with nops to avoid wqe wrap around */
-   while ((pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc)) > sq->edge) {
-   sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_NOP;
-   mlx5e_post_nop(wq, sq->sqn, >pc);
+   pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+
+   if (unlikely(pi + MLX5E_UMR_WQEBBS > mlx5_wq_cyc_get_size(wq))) {
+   mlx5e_fill_icosq_edge(sq, wq, pi);
+   pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
}
 
umr_wqe = 

[for-next 06/12] net/mlx5e: Use WQ API functions instead of direct fields access

2018-05-25 Thread Saeed Mahameed
From: Tariq Toukan 

Use the WQ API to get the WQ size, and to map a counter
into a WQ entry index.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  9 ++---
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 37 +++
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 25 +++--
 .../net/ethernet/mellanox/mlx5/core/en_tx.c   |  8 ++--
 drivers/net/ethernet/mellanox/mlx5/core/wq.h  | 19 +-
 5 files changed, 60 insertions(+), 38 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c5c7a6d687ff..061c4e90692e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -450,7 +450,7 @@ struct mlx5e_icosq {
 static inline bool
 mlx5e_wqc_has_room_for(struct mlx5_wq_cyc *wq, u16 cc, u16 pc, u16 n)
 {
-   return (((wq->sz_m1 & (cc - pc)) >= n) || (cc == pc));
+   return (mlx5_wq_cyc_ctr2ix(wq, cc - pc) >= n) || (cc == pc);
 }
 
 struct mlx5e_dma_info {
@@ -956,10 +956,9 @@ static inline void mlx5e_sq_fetch_wqe(struct mlx5e_txqsq 
*sq,
  struct mlx5e_tx_wqe **wqe,
  u16 *pi)
 {
-   struct mlx5_wq_cyc *wq;
+   struct mlx5_wq_cyc *wq = >wq;
 
-   wq = >wq;
-   *pi = sq->pc & wq->sz_m1;
+   *pi  = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
*wqe = mlx5_wq_cyc_get_wqe(wq, *pi);
memset(*wqe, 0, sizeof(**wqe));
 }
@@ -967,7 +966,7 @@ static inline void mlx5e_sq_fetch_wqe(struct mlx5e_txqsq 
*sq,
 static inline
 struct mlx5e_tx_wqe *mlx5e_post_nop(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc)
 {
-   u16 pi   = *pc & wq->sz_m1;
+   u16 pi   = mlx5_wq_cyc_ctr2ix(wq, *pc);
struct mlx5e_tx_wqe*wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
struct mlx5_wqe_ctrl_seg   *cseg = >ctrl;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index cee44c21766c..41f57afc5140 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -836,13 +836,15 @@ static int mlx5e_open_rq(struct mlx5e_channel *c,
 static void mlx5e_activate_rq(struct mlx5e_rq *rq)
 {
struct mlx5e_icosq *sq = >channel->icosq;
-   u16 pi = sq->pc & sq->wq.sz_m1;
+   struct mlx5_wq_cyc *wq = >wq;
struct mlx5e_tx_wqe *nopwqe;
 
+   u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+
set_bit(MLX5E_RQ_STATE_ENABLED, >state);
sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_NOP;
-   nopwqe = mlx5e_post_nop(>wq, sq->sqn, >pc);
-   mlx5e_notify_hw(>wq, sq->pc, sq->uar_map, >ctrl);
+   nopwqe = mlx5e_post_nop(wq, sq->sqn, >pc);
+   mlx5e_notify_hw(wq, sq->pc, sq->uar_map, >ctrl);
 }
 
 static void mlx5e_deactivate_rq(struct mlx5e_rq *rq)
@@ -885,6 +887,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
 {
void *sqc_wq   = MLX5_ADDR_OF(sqc, param->sqc, wq);
struct mlx5_core_dev *mdev = c->mdev;
+   struct mlx5_wq_cyc *wq = >wq;
int err;
 
sq->pdev  = c->pdev;
@@ -894,10 +897,10 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
sq->min_inline_mode = params->tx_min_inline_mode;
 
param->wq.db_numa_node = cpu_to_node(c->cpu);
-   err = mlx5_wq_cyc_create(mdev, >wq, sqc_wq, >wq, 
>wq_ctrl);
+   err = mlx5_wq_cyc_create(mdev, >wq, sqc_wq, wq, >wq_ctrl);
if (err)
return err;
-   sq->wq.db = >wq.db[MLX5_SND_DBR];
+   wq->db = >db[MLX5_SND_DBR];
 
err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
if (err)
@@ -940,22 +943,23 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
 {
void *sqc_wq   = MLX5_ADDR_OF(sqc, param->sqc, wq);
struct mlx5_core_dev *mdev = c->mdev;
+   struct mlx5_wq_cyc *wq = >wq;
int err;
 
sq->channel   = c;
sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 
param->wq.db_numa_node = cpu_to_node(c->cpu);
-   err = mlx5_wq_cyc_create(mdev, >wq, sqc_wq, >wq, 
>wq_ctrl);
+   err = mlx5_wq_cyc_create(mdev, >wq, sqc_wq, wq, >wq_ctrl);
if (err)
return err;
-   sq->wq.db = >wq.db[MLX5_SND_DBR];
+   wq->db = >db[MLX5_SND_DBR];
 
err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
if (err)
goto err_sq_wq_destroy;
 
-   sq->edge = (sq->wq.sz_m1 + 1) - MLX5E_ICOSQ_MAX_WQEBBS;
+   sq->edge = mlx5_wq_cyc_get_size(wq) - MLX5E_ICOSQ_MAX_WQEBBS;
 
return 0;
 
@@ -1005,6 +1009,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 {
void *sqc_wq   = MLX5_ADDR_OF(sqc, param->sqc, wq);
struct mlx5_core_dev *mdev = c->mdev;
+   struct mlx5_wq_cyc *wq 

[for-next 03/12] net/mlx5: E-switch, Create a second level FDB flow table

2018-05-25 Thread Saeed Mahameed
From: Chris Mi 

If firmware supports the forward action with a destination list
that includes a flow table, create a second level FDB flow table.

This is going to be used for flow based mirroring under the switchdev
offloads mode.

Signed-off-by: Chris Mi 
Reviewed-by: Paul Blakey 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  4 +++
 .../mellanox/mlx5/core/eswitch_offloads.c | 31 ---
 .../net/ethernet/mellanox/mlx5/core/fs_core.c |  2 +-
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index d1a3f7fcca1c..d06c11629121 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -55,6 +55,9 @@
 #define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \
min_t(u32, max_t(u32, (rate) / (divider), MLX5_MIN_BW_SHARE), limit)
 
+#define mlx5_esw_has_fwd_fdb(dev) \
+   MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_to_table)
+
 struct vport_ingress {
struct mlx5_flow_table *acl;
struct mlx5_flow_group *allow_untagged_spoofchk_grp;
@@ -127,6 +130,7 @@ struct mlx5_eswitch_fdb {
 
struct offloads_fdb {
struct mlx5_flow_table *fast_fdb;
+   struct mlx5_flow_table *fwd_fdb;
struct mlx5_flow_table *slow_fdb;
struct mlx5_flow_group *send_to_vport_grp;
struct mlx5_flow_group *miss_grp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index bb8eac5523a7..8ea11f24380c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -454,7 +454,7 @@ static int esw_create_offloads_fast_fdb_table(struct 
mlx5_eswitch *esw)
if (!root_ns) {
esw_warn(dev, "Failed to get FDB flow namespace\n");
err = -EOPNOTSUPP;
-   goto out;
+   goto out_namespace;
}
 
esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max 
counters(%d)*groups(%d))\n",
@@ -464,6 +464,9 @@ static int esw_create_offloads_fast_fdb_table(struct 
mlx5_eswitch *esw)
esw_size = min_t(int, max_flow_counter * ESW_OFFLOADS_NUM_GROUPS,
 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
 
+   if (mlx5_esw_has_fwd_fdb(dev))
+   esw_size >>= 1;
+
if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
flags |= MLX5_FLOW_TABLE_TUNNEL_EN;
 
@@ -474,16 +477,36 @@ static int esw_create_offloads_fast_fdb_table(struct 
mlx5_eswitch *esw)
if (IS_ERR(fdb)) {
err = PTR_ERR(fdb);
esw_warn(dev, "Failed to create Fast path FDB Table err %d\n", 
err);
-   goto out;
+   goto out_namespace;
}
esw->fdb_table.offloads.fast_fdb = fdb;
 
-out:
+   if (!mlx5_esw_has_fwd_fdb(dev))
+   goto out_namespace;
+
+   fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
+ esw_size,
+ ESW_OFFLOADS_NUM_GROUPS, 1,
+ flags);
+   if (IS_ERR(fdb)) {
+   err = PTR_ERR(fdb);
+   esw_warn(dev, "Failed to create fwd table err %d\n", err);
+   goto out_ft;
+   }
+   esw->fdb_table.offloads.fwd_fdb = fdb;
+
+   return err;
+
+out_ft:
+   mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
+out_namespace:
return err;
 }
 
 static void esw_destroy_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
 {
+   if (mlx5_esw_has_fwd_fdb(esw->dev))
+   mlx5_destroy_flow_table(esw->fdb_table.offloads.fwd_fdb);
mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
 }
 
@@ -588,7 +611,7 @@ static int esw_create_offloads_fdb_tables(struct 
mlx5_eswitch *esw, int nvports)
 send_vport_err:
mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
 slow_fdb_err:
-   mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
+   esw_destroy_offloads_fast_fdb_table(esw);
 fast_fdb_err:
 ns_err:
kvfree(flow_group_in);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 806e95523f9e..f9c2c03083eb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2495,7 +2495,7 @@ static int init_fdb_root_ns(struct mlx5_flow_steering 
*steering)
if (!steering->fdb_root_ns)
   

[for-next 08/12] net/mlx5i: Use compilation flag in IPOIB header

2018-05-25 Thread Saeed Mahameed
From: Tariq Toukan 

If CONFIG_MLX5_CORE_IPOIB is not set, compile-out the
IPOIB related headers.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
index 45a11864e544..08eac92fc26c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
@@ -33,6 +33,8 @@
 #ifndef __MLX5E_IPOB_H__
 #define __MLX5E_IPOB_H__
 
+#ifdef CONFIG_MLX5_CORE_IPOIB
+
 #include 
 #include "en.h"
 
@@ -120,4 +122,5 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct 
sk_buff *skb,
  struct mlx5_av *av, u32 dqpn, u32 dqkey);
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
 
+#endif /* CONFIG_MLX5_CORE_IPOIB */
 #endif /* __MLX5E_IPOB_H__ */
-- 
2.17.0



[for-next 10/12] net/mlx5e: Move phy link down events counter out of SW stats

2018-05-25 Thread Saeed Mahameed
PHY link down events counter belongs to the phy_counters group.
Although it has special handling, that doesn't mean it can't be there.

Move it to phy_counters_grp handler.

Signed-off-by: Saeed Mahameed 
---
 .../ethernet/mellanox/mlx5/core/en_stats.c| 37 +++
 .../ethernet/mellanox/mlx5/core/en_stats.h|  3 --
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index e17919c0af08..973939ed8bb5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -81,7 +81,6 @@ static const struct counter_desc sw_stats_desc[] = {
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_busy) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_waive) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_eq_rearm) },
-   { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, link_down_events_phy) },
 };
 
 #define NUM_SW_COUNTERSARRAY_SIZE(sw_stats_desc)
@@ -175,9 +174,6 @@ static void mlx5e_grp_sw_update_stats(struct mlx5e_priv 
*priv)
}
}
 
-   s->link_down_events_phy = MLX5_GET(ppcnt_reg,
-   priv->stats.pport.phy_counters,
-   counter_set.phys_layer_cntrs.link_down_events);
	memcpy(&priv->stats.sw, s, sizeof(*s));
 }
 
@@ -580,12 +576,13 @@ static const struct counter_desc 
pport_phy_statistical_stats_desc[] = {
{ "rx_corrected_bits_phy", 
PPORT_PHY_STATISTICAL_OFF(phy_corrected_bits) },
 };
 
-#define NUM_PPORT_PHY_COUNTERS 
ARRAY_SIZE(pport_phy_statistical_stats_desc)
+#define NUM_PPORT_PHY_STATISTICAL_COUNTERS 
ARRAY_SIZE(pport_phy_statistical_stats_desc)
 
 static int mlx5e_grp_phy_get_num_stats(struct mlx5e_priv *priv)
 {
+   /* "1" for link_down_events special counter */
return MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group) ?
-   NUM_PPORT_PHY_COUNTERS : 0;
+   NUM_PPORT_PHY_STATISTICAL_COUNTERS + 1 : 1;
 }
 
 static int mlx5e_grp_phy_fill_strings(struct mlx5e_priv *priv, u8 *data,
@@ -593,10 +590,14 @@ static int mlx5e_grp_phy_fill_strings(struct mlx5e_priv 
*priv, u8 *data,
 {
int i;
 
-   if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group))
-   for (i = 0; i < NUM_PPORT_PHY_COUNTERS; i++)
-   strcpy(data + (idx++) * ETH_GSTRING_LEN,
-  pport_phy_statistical_stats_desc[i].format);
+   strcpy(data + (idx++) * ETH_GSTRING_LEN, "link_down_events_phy");
+
+   if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group))
+   return idx;
+
+   for (i = 0; i < NUM_PPORT_PHY_STATISTICAL_COUNTERS; i++)
+   strcpy(data + (idx++) * ETH_GSTRING_LEN,
+  pport_phy_statistical_stats_desc[i].format);
return idx;
 }
 
@@ -604,11 +605,17 @@ static int mlx5e_grp_phy_fill_stats(struct mlx5e_priv 
*priv, u64 *data, int idx)
 {
int i;
 
-   if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group))
-   for (i = 0; i < NUM_PPORT_PHY_COUNTERS; i++)
-   data[idx++] =
-   
MLX5E_READ_CTR64_BE(&priv->stats.pport.phy_statistical_counters,
-   
pport_phy_statistical_stats_desc, i);
+   /* link_down_events_phy has special handling since it is not stored in 
__be64 format */
+   data[idx++] = MLX5_GET(ppcnt_reg, priv->stats.pport.phy_counters,
+  counter_set.phys_layer_cntrs.link_down_events);
+
+   if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group))
+   return idx;
+
+   for (i = 0; i < NUM_PPORT_PHY_STATISTICAL_COUNTERS; i++)
+   data[idx++] =
+   
MLX5E_READ_CTR64_BE(&priv->stats.pport.phy_statistical_counters,
+   pport_phy_statistical_stats_desc, 
i);
return idx;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index a36e6a87066b..39ced559929a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -97,9 +97,6 @@ struct mlx5e_sw_stats {
u64 tx_tls_ooo;
u64 tx_tls_resync_bytes;
 #endif
-
-   /* Special handling counters */
-   u64 link_down_events_phy;
 };
 
 struct mlx5e_qcounter_stats {
-- 
2.17.0



[for-next 12/12] net/mlx5e: Avoid reset netdev stats on configuration changes

2018-05-25 Thread Saeed Mahameed
From: Eran Ben Elisha 

Move all RQ, SQ and channel counters from the channel objects into the
priv structure.  With this change, counters will not be reset upon
channel configuration changes.

Channel's statistics for SQs which are associated with TCs higher than
zero will be presented in ethtool -S, only for SQs which were opened at
least once since the module was loaded (regardless of their open/close
current status).  This is done in order to decrease the total amount of
statistics presented and calculated for the common out of box use (no
QoS).

mlx5e_channel_stats is a compound of CH,RQ,SQs stats in order to
create locality for the NAPI when handling TX and RX of the same
channel.

Align the new statistics struct per ring to avoid several channels
update to the same cache line at the same time.
Packet rate was tested, no degradation sensed.

Signed-off-by: Eran Ben Elisha 
CC: Qing Huang 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 14 +++-
 .../mellanox/mlx5/core/en_accel/tls_rxtx.c|  4 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 28 ---
 .../net/ethernet/mellanox/mlx5/core/en_rep.c  |  4 +-
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 75 +++
 .../ethernet/mellanox/mlx5/core/en_stats.c| 56 +++---
 .../net/ethernet/mellanox/mlx5/core/en_tx.c   | 49 ++--
 .../net/ethernet/mellanox/mlx5/core/en_txrx.c |  6 +-
 8 files changed, 136 insertions(+), 100 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c3c79f2835d2..1c04df043e07 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -358,7 +358,6 @@ struct mlx5e_txqsq {
/* dirtied @xmit */
u16pc cacheline_aligned_in_smp;
u32dma_fifo_pc;
-   struct mlx5e_sq_stats  stats;
 
struct mlx5e_cqcq;
 
@@ -371,6 +370,7 @@ struct mlx5e_txqsq {
/* read only */
struct mlx5_wq_cyc wq;
u32dma_fifo_mask;
+   struct mlx5e_sq_stats *stats;
void __iomem  *uar_map;
struct netdev_queue   *txq;
u32sqn;
@@ -526,7 +526,7 @@ struct mlx5e_rq {
struct mlx5e_channel  *channel;
struct device *pdev;
struct net_device *netdev;
-   struct mlx5e_rq_stats  stats;
+   struct mlx5e_rq_stats *stats;
struct mlx5e_cqcq;
struct mlx5e_page_cache page_cache;
struct hwtstamp_config *tstamp;
@@ -574,7 +574,7 @@ struct mlx5e_channel {
 
/* data path - accessed per napi poll */
struct irq_desc *irq_desc;
-   struct mlx5e_ch_stats  stats;
+   struct mlx5e_ch_stats *stats;
 
/* control */
struct mlx5e_priv *priv;
@@ -590,6 +590,12 @@ struct mlx5e_channels {
struct mlx5e_paramsparams;
 };
 
+struct mlx5e_channel_stats {
+   struct mlx5e_ch_stats ch;
+   struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC];
+   struct mlx5e_rq_stats rq;
+} cacheline_aligned_in_smp;
+
 enum mlx5e_traffic_types {
MLX5E_TT_IPV4_TCP,
MLX5E_TT_IPV6_TCP,
@@ -793,6 +799,8 @@ struct mlx5e_priv {
struct mlx5_core_dev  *mdev;
struct net_device *netdev;
struct mlx5e_stats stats;
+   struct mlx5e_channel_stats channel_stats[MLX5E_MAX_NUM_CHANNELS];
+   u8 max_opened_tc;
struct hwtstamp_config tstamp;
u16q_counter;
u16drop_rq_q_counter;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
index ad2790fb5966..15aef71d1957 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
@@ -174,7 +174,7 @@ mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context 
*context,
int headln;
int i;
 
-   sq->stats.tls_ooo++;
+   sq->stats->tls_ooo++;
 
	if (mlx5e_tls_get_sync_data(context, tcp_seq, &info)) {
		/* We might get here if a retransmission reaches the driver
/* We might get here if a retransmission reaches the driver
@@ -220,7 +220,7 @@ mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context 
*context,
skb_shinfo(nskb)->nr_frags = info.nr_frags;
nskb->data_len = info.sync_len;
nskb->len += info.sync_len;
-   sq->stats.tls_resync_bytes += nskb->len;
+   sq->stats->tls_resync_bytes += nskb->len;
mlx5e_tls_complete_sync_skb(skb, nskb, tcp_seq, headln,
cpu_to_be64(info.rcd_sn));
mlx5e_sq_xmit(sq, nskb, *wqe, *pi);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 

[for-next 09/12] net/mlx5: Use order-0 allocations for all WQ types

2018-05-25 Thread Saeed Mahameed
From: Tariq Toukan 

Complete the transition of all WQ types to use fragmented
order-0 coherent memory instead of high-order allocations.

CQ-WQ already uses order-0.
Here we do the same for cyclic and linked-list WQs.

This allows the driver to load cleanly on systems with a highly
fragmented coherent memory.

Performance tests:
ConnectX-5 100Gbps, CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
Packet rate of 64B packets, single transmit ring, size 8K.

No degradation is sensed.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 15 +--
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 17 ++--
 .../net/ethernet/mellanox/mlx5/core/en_tx.c   | 24 ++---
 .../ethernet/mellanox/mlx5/core/fpga/conn.c   | 14 +--
 .../ethernet/mellanox/mlx5/core/fpga/conn.h   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/wq.c  | 94 ---
 drivers/net/ethernet/mellanox/mlx5/core/wq.h  | 33 +++
 include/linux/mlx5/driver.h   | 16 +++-
 9 files changed, 123 insertions(+), 94 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3c0f0a0343fd..9396db54973f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -314,7 +314,7 @@ struct mlx5e_cq {
 
/* control */
struct mlx5_core_dev  *mdev;
-   struct mlx5_frag_wq_ctrl   wq_ctrl;
+   struct mlx5_wq_ctrlwq_ctrl;
 } cacheline_aligned_in_smp;
 
 struct mlx5e_tx_wqe_info {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a8b1e43384ca..0c167e5fc346 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -646,8 +646,8 @@ static int mlx5e_create_rq(struct mlx5e_rq *rq,
MLX5_ADAPTER_PAGE_SHIFT);
MLX5_SET64(wq, wq,  dbr_addr,   rq->wq_ctrl.db.dma);
 
-   mlx5_fill_page_array(>wq_ctrl.buf,
-(__be64 *)MLX5_ADDR_OF(wq, wq, pas));
+   mlx5_fill_page_frag_array(>wq_ctrl.buf,
+ (__be64 *)MLX5_ADDR_OF(wq, wq, pas));
 
err = mlx5_core_create_rq(mdev, in, inlen, >rqn);
 
@@ -1096,7 +1096,8 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev,
  MLX5_ADAPTER_PAGE_SHIFT);
MLX5_SET64(wq, wq, dbr_addr,  csp->wq_ctrl->db.dma);
 
-   mlx5_fill_page_array(>wq_ctrl->buf, (__be64 *)MLX5_ADDR_OF(wq, wq, 
pas));
+   mlx5_fill_page_frag_array(>wq_ctrl->buf,
+ (__be64 *)MLX5_ADDR_OF(wq, wq, pas));
 
err = mlx5_core_create_sq(mdev, in, inlen, sqn);
 
@@ -1538,7 +1539,7 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c,
 
 static void mlx5e_free_cq(struct mlx5e_cq *cq)
 {
-   mlx5_cqwq_destroy(>wq_ctrl);
+   mlx5_wq_destroy(>wq_ctrl);
 }
 
 static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
@@ -1554,7 +1555,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct 
mlx5e_cq_param *param)
int err;
 
inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
-   sizeof(u64) * cq->wq_ctrl.frag_buf.npages;
+   sizeof(u64) * cq->wq_ctrl.buf.npages;
in = kvzalloc(inlen, GFP_KERNEL);
if (!in)
return -ENOMEM;
@@ -1563,7 +1564,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct 
mlx5e_cq_param *param)
 
memcpy(cqc, param->cqc, sizeof(param->cqc));
 
-   mlx5_fill_page_frag_array(>wq_ctrl.frag_buf,
+   mlx5_fill_page_frag_array(>wq_ctrl.buf,
  (__be64 *)MLX5_ADDR_OF(create_cq_in, in, 
pas));
 
	mlx5_vector2eqn(mdev, param->eq_ix, &eqn, &irqn_not_used);
@@ -1571,7 +1572,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct 
mlx5e_cq_param *param)
MLX5_SET(cqc,   cqc, cq_period_mode, param->cq_period_mode);
MLX5_SET(cqc,   cqc, c_eqn, eqn);
MLX5_SET(cqc,   cqc, uar_page,  mdev->priv.uar->index);
-   MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.frag_buf.page_shift -
+   MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
MLX5_ADAPTER_PAGE_SHIFT);
MLX5_SET64(cqc, cqc, dbr_addr,  cq->wq_ctrl.db.dma);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f4d2c8886492..ac54380d41e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -383,16 +383,16 @@ static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq 
*sq)
return sq->pc >> MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
 }
 
-static 

Re: [PATCH, net-next] qcom-emag: hide ACPI specific functions

2018-05-25 Thread Timur Tabi

On 5/25/18 4:37 PM, Arnd Bergmann wrote:

+#ifdef CONFIG_ACPI
  static int emac_sgmii_irq_clear(struct emac_adapter *adpt, u8 irq_bits)
  {
struct emac_sgmii *phy = >phy;
@@ -288,6 +289,7 @@ static struct sgmii_ops qdf2400_ops = {
.link_change = emac_sgmii_common_link_change,
.reset = emac_sgmii_common_reset,
  };
+#endif


This seems wrong.  The SGMII interrupt handler should still be viable on 
a device-tree system.  There is a DT compatibility entry for the qdf2432.


Looks like that most recent patch on net-next broke DT support, when it 
removed these lines:


-   phy->open = emac_sgmii_open;
-   phy->close = emac_sgmii_close;
-   phy->link_up = emac_sgmii_link_up;
-   phy->link_down = emac_sgmii_link_down;

I'll take it look at it next week when I'm back in the office.

--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


Re: aio poll and a new in-kernel poll API V13

2018-05-25 Thread Al Viro
On Wed, May 23, 2018 at 09:19:49PM +0200, Christoph Hellwig wrote:
> Hi all,
> 
> this series adds support for the IOCB_CMD_POLL operation to poll for the
> readyness of file descriptors using the aio subsystem.  The API is based
> on patches that existed in RHAS2.1 and RHEL3, which means it already is
> supported by libaio.  To implement the poll support efficiently new
> methods to poll are introduced in struct file_operations:  get_poll_head
> and poll_mask.  The first one returns a wait_queue_head to wait on
> (lifetime is bound by the file), and the second does a non-blocking
> check for the POLL* events.  This allows aio poll to work without
> any additional context switches, unlike epoll.
> 
> This series sits on top of the aio-fsync series that also includes
> support for io_pgetevents.

OK, I can live with that, except for one problem - the first patch shouldn't
be sitting on top of arseloads of next window fodder.

Please, rebase the rest of the series on top of merge of vfs.git#fixes
(4faa99965e02) with your aio-fsync.4 and tell me what to pull.


Confusion about (new?) bridge behavior

2018-05-25 Thread Florian Fainelli
Hi,

The DSA b53 driver currently (well, not quite currently, but at some
point) was forcing the CPU port VLAN membership to be tagged. The reason
for that is mostly because if we allow the CPU port to be untagged in
multiple VLANs we can't quite properly separate traffic at the CPU port
level, so we don't do that.

At some point v4.12, or maybe as far back as 4.9, I remember the
following from happening:

- it was not necessary to turn on vlan_filtering on the bridge to get
the DSA layer to program VLAN entries into the switch, I only get those
calls now if vlan_filtering is enabled, this is not a big issue, but I
do wonder if something did change here? Could be a DSA specific issue
here, I can bisect that later

- because the CPU port was configured tagged into VLAN 1, it was
necessary to either change the default bridge master device settings
(default being VID 1 pvid untagged) to pop the VLAN tag, and conversely
push the VLAN tag on egress path and/or create a network device such
that would be responsible for terminating the VLAN tag, e.g: br0.1

But now, this does not seem to be necessary and I just can't explain
why, it might very well be a switch driver configuration issue.

Now what I am seeing is the following:

1) Default configuration with vlan_filtering = 1

# bridge vlan show
portvlan ids
gphy 1 PVID Egress Untagged

br0  1 PVID Egress Untagged

If I look at the CPU port, I can see that frames from switch to CPU have
VLAN tag 1, which is expected, yet frames that are from CPU to switch do
not have such a VLAN tag 1, and yet things work okay, dhcp + ping using
br0 work just fine. I suppose that is because of the pvid behavior which
assigns untagged frames to the default VLAN which happens to be 1 in
that case.

2) Changing the bridge to be in VLAN 1 no pvid, no untagged, still with
vlan_filtering = 1

# bridge vlan add vid 1 dev br0 self
# bridge vlan show
portvlan ids
gphy 1 PVID Egress Untagged

br0  1

The frames from switch to CPU still have VLAN tag 1, now I need to
create a br0.1 device to insert the VLAN tag for dhcp + ping to work
using br0.1 in that case, though I still do not see the VLAN tag being
present for frames that are from CPU to switch...

Was I hallucinating before and things were equally broken, or did
something change recently?

Thank you!
-- 
Florian




Re: [PATCH net-next v12 2/5] netvsc: refactor notifier/event handling code to use the failover framework

2018-05-25 Thread Stephen Hemminger
On Fri, 25 May 2018 16:11:47 -0700
"Samudrala, Sridhar"  wrote:

> On 5/25/2018 3:34 PM, Stephen Hemminger wrote:
> > On Thu, 24 May 2018 09:55:14 -0700
> > Sridhar Samudrala  wrote:
> >  
> >> --- a/drivers/net/hyperv/Kconfig
> >> +++ b/drivers/net/hyperv/Kconfig
> >> @@ -2,5 +2,6 @@ config HYPERV_NET
> >>tristate "Microsoft Hyper-V virtual network driver"
> >>depends on HYPERV
> >>select UCS2_STRING
> >> +  select FAILOVER  
> > When I take a working kernel config, add the patches then do
> > make oldconfig
> >
> > It is not autoselecting FAILOVER, it prompts me for it. This means
> > if user says no then a non-working netvsc device is made.  
> 
> I see
> Generic failover module (FAILOVER) [M/y/?] (NEW)
> 
> So the user is given an option to either build as a Module or part of the
> kernel. 'n' is not an option.

With most libraries there is no prompt at all.


Re: [PATCH, net-next] net/mlx5e: fix TLS dependency

2018-05-25 Thread Saeed Mahameed
On Fri, 2018-05-25 at 23:36 +0200, Arnd Bergmann wrote:
> With CONFIG_TLS=m and MLX5_CORE_EN=y, we get a link failure:
> 
> drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.o: In
> function `mlx5e_tls_handle_ooo':
> tls_rxtx.c:(.text+0x24c): undefined reference to `tls_get_record'
> drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.o: In
> function `mlx5e_tls_handle_tx_skb':
> tls_rxtx.c:(.text+0x9a8): undefined reference to
> `tls_device_sk_destruct'
> 
> This narrows down the dependency to only allow the configurations
> that will actually work. The existing dependency on TLS_DEVICE is
> not sufficient here since MLX5_EN_TLS is a 'bool' symbol.
> 
> Fixes: c83294b9efa5 ("net/mlx5e: TLS, Add Innova TLS TX support")
> Signed-off-by: Arnd Bergmann 
> ---

LGTM

Acked-by: Saeed Mahameed 

Thank you Arnd!


>  drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
> b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
> index ee6684779d11..2545296a0c08 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
> @@ -91,6 +91,7 @@ config MLX5_EN_TLS
>   bool "TLS cryptography-offload accelaration"
>   depends on MLX5_CORE_EN
>   depends on TLS_DEVICE
> + depends on TLS=y || MLX5_CORE=m
>   depends on MLX5_ACCEL
>   default n
>   ---help---

Re: mmotm 2018-05-25-14-52 uploaded (drivers/net/ethernet/ti/davinci_mdio.c)

2018-05-25 Thread Randy Dunlap
[forgot to add netdev]

On 05/25/2018 04:14 PM, Randy Dunlap wrote:
> On 05/25/2018 02:52 PM, a...@linux-foundation.org wrote:
>> The mm-of-the-moment snapshot 2018-05-25-14-52 has been uploaded to
>>
>>http://www.ozlabs.org/~akpm/mmotm/
>>
>> mmotm-readme.txt says
>>
>> README for mm-of-the-moment:
>>
>> http://www.ozlabs.org/~akpm/mmotm/
>>
>> This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
>> more than once a week.
> 
> on x86_64:
> # CONFIG_OF is not set
> 
>   CC  drivers/net/ethernet/ti/davinci_cpdma.o
> ../drivers/net/ethernet/ti/davinci_mdio.c: In function 'davinci_mdio_probe':
> ../drivers/net/ethernet/ti/davinci_mdio.c:380:3: error: implicit declaration 
> of function 'davinci_mdio_probe_dt' [-Werror=implicit-function-declaration]
>ret = davinci_mdio_probe_dt(>pdata, pdev);
> 
> 
> 
>>
>> You will need quilt to apply these patches to the latest Linus release (4.x
>> or 4.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
>> http://ozlabs.org/~akpm/mmotm/series
>>
>> The file broken-out.tar.gz contains two datestamp files: .DATE and
>> .DATE--mm-dd-hh-mm-ss.  Both contain the string -mm-dd-hh-mm-ss,
>> followed by the base kernel version against which this patch series is to
>> be applied.
>>
>> This tree is partially included in linux-next.  To see which patches are
>> included in linux-next, consult the `series' file.  Only the patches
>> within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in
>> linux-next.
>>
>> A git tree which contains the memory management portion of this tree is
>> maintained at git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git
>> by Michal Hocko.  It contains the patches which are between the
>> "#NEXT_PATCHES_START mm" and "#NEXT_PATCHES_END" markers, from the series
>> file, http://www.ozlabs.org/~akpm/mmotm/series.
>>
>>
>> A full copy of the full kernel tree with the linux-next and mmotm patches
>> already applied is available through git within an hour of the mmotm
>> release.  Individual mmotm releases are tagged.  The master branch always
>> points to the latest release, so it's constantly rebasing.
>>
>> http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/
>>
>> To develop on top of mmotm git:
>>
>>   $ git remote add mmotm 
>> git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git
>>   $ git remote update mmotm
>>   $ git checkout -b topic mmotm/master
>>   
>>   $ git send-email mmotm/master.. [...]
>>
>> To rebase a branch with older patches to a new mmotm release:
>>
>>   $ git remote update mmotm
>>   $ git rebase --onto mmotm/master  topic
>>
>>
>>
>>
>> The directory http://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second)
>> contains daily snapshots of the -mm tree.  It is updated more frequently
>> than mmotm, and is untested.
>>
>> A git copy of this tree is available at
>>
>>  http://git.cmpxchg.org/cgit.cgi/linux-mmots.git/
>>
>> and use of this tree is similar to
>> http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/, described above.
>>
>>
>> This mmotm tree contains the following patches against 4.17-rc6:
>> (patches marked "*" will be included in linux-next)
>>
> 
> 


-- 
~Randy


Re: [PATCH net-next v12 2/5] netvsc: refactor notifier/event handling code to use the failover framework

2018-05-25 Thread Samudrala, Sridhar


On 5/25/2018 3:34 PM, Stephen Hemminger wrote:

On Thu, 24 May 2018 09:55:14 -0700
Sridhar Samudrala  wrote:


--- a/drivers/net/hyperv/Kconfig
+++ b/drivers/net/hyperv/Kconfig
@@ -2,5 +2,6 @@ config HYPERV_NET
tristate "Microsoft Hyper-V virtual network driver"
depends on HYPERV
select UCS2_STRING
+   select FAILOVER

When I take a working kernel config, add the patches then do
make oldconfig

It is not autoselecting FAILOVER, it prompts me for it. This means
if user says no then a non-working netvsc device is made.


I see
   Generic failover module (FAILOVER) [M/y/?] (NEW)

So the user is given an option to either build as a Module or part of the
kernel. 'n' is not an option.




Re: [PATCH net-next v12 1/5] net: Introduce generic failover module

2018-05-25 Thread Samudrala, Sridhar


On 5/25/2018 3:38 PM, Stephen Hemminger wrote:

On Thu, 24 May 2018 09:55:13 -0700
Sridhar Samudrala  wrote:


diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03ed492c4e14..0f4ba52b641d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1421,6 +1421,8 @@ struct net_device_ops {
   *entity (i.e. the master device for bridged veth)
   * @IFF_MACSEC: device is a MACsec device
   * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
+ * @IFF_FAILOVER: device is a failover master device
+ * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
   */
  enum netdev_priv_flags {
IFF_802_1Q_VLAN = 1<<0,
@@ -1450,6 +1452,8 @@ enum netdev_priv_flags {
IFF_PHONY_HEADROOM  = 1<<24,
IFF_MACSEC  = 1<<25,
IFF_NO_RX_HANDLER   = 1<<26,
+   IFF_FAILOVER= 1<<27,
+   IFF_FAILOVER_SLAVE  = 1<<28,
  };

Why is FAILOVER any different than other master/slave relationships.
I don't think you need to take up precious netdev flag bits for this.


These are netdev priv flags.
Jiri says that IFF_MASTER/IFF_SLAVE are bonding-specific flags and cannot be
used with other failover mechanisms. Team also doesn't use these flags and it
has its own priv_flags.



Re: [PATCH net-next v12 1/5] net: Introduce generic failover module

2018-05-25 Thread Stephen Hemminger
On Thu, 24 May 2018 09:55:13 -0700
Sridhar Samudrala  wrote:

> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 03ed492c4e14..0f4ba52b641d 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1421,6 +1421,8 @@ struct net_device_ops {
>   *   entity (i.e. the master device for bridged veth)
>   * @IFF_MACSEC: device is a MACsec device
>   * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
> + * @IFF_FAILOVER: device is a failover master device
> + * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
>   */
>  enum netdev_priv_flags {
>   IFF_802_1Q_VLAN = 1<<0,
> @@ -1450,6 +1452,8 @@ enum netdev_priv_flags {
>   IFF_PHONY_HEADROOM  = 1<<24,
>   IFF_MACSEC  = 1<<25,
>   IFF_NO_RX_HANDLER   = 1<<26,
> + IFF_FAILOVER= 1<<27,
> + IFF_FAILOVER_SLAVE  = 1<<28,
>  };

Why is FAILOVER any different than other master/slave relationships.
I don't think you need to take up precious netdev flag bits for this.


Re: [PATCH net-next v12 1/5] net: Introduce generic failover module

2018-05-25 Thread Stephen Hemminger
On Thu, 24 May 2018 09:55:13 -0700
Sridhar Samudrala  wrote:


> +	spin_lock(&failover_lock);

Since register is not in fast path, this should be a mutex?


> +int failover_slave_unregister(struct net_device *slave_dev)
> +{
> + struct net_device *failover_dev;
> + struct failover_ops *fops;
> +
> + if (!netif_is_failover_slave(slave_dev))
> + goto done;
> +
> + ASSERT_RTNL();
> +
> +	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
> + if (!failover_dev)
> + goto done;

Since the slave device must have a master device set already, why not use
that instead of searching by MAC address on unregister or link change.



Re: [PATCH net-next v12 2/5] netvsc: refactor notifier/event handling code to use the failover framework

2018-05-25 Thread Stephen Hemminger
On Thu, 24 May 2018 09:55:14 -0700
Sridhar Samudrala  wrote:

> --- a/drivers/net/hyperv/Kconfig
> +++ b/drivers/net/hyperv/Kconfig
> @@ -2,5 +2,6 @@ config HYPERV_NET
>   tristate "Microsoft Hyper-V virtual network driver"
>   depends on HYPERV
>   select UCS2_STRING
> + select FAILOVER

When I take a working kernel config, add the patches then do
make oldconfig

It is not autoselecting FAILOVER, it prompts me for it. This means
if user says no then a non-working netvsc device is made.


[PATCH] mwifiex: mark expected switch fall-throughs

2018-05-25 Thread Gustavo A. R. Silva
In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/net/wireless/marvell/mwifiex/cfg80211.c | 4 
 drivers/net/wireless/marvell/mwifiex/scan.c | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c 
b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 54a2297..16a705d 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -1158,6 +1158,7 @@ mwifiex_cfg80211_change_virtual_intf(struct wiphy *wiphy,
case NL80211_IFTYPE_UNSPECIFIED:
mwifiex_dbg(priv->adapter, INFO,
"%s: kept type as IBSS\n", dev->name);
+   /* fall through */
case NL80211_IFTYPE_ADHOC:  /* This shouldn't happen */
return 0;
default:
@@ -1188,6 +1189,7 @@ mwifiex_cfg80211_change_virtual_intf(struct wiphy *wiphy,
case NL80211_IFTYPE_UNSPECIFIED:
mwifiex_dbg(priv->adapter, INFO,
"%s: kept type as STA\n", dev->name);
+   /* fall through */
case NL80211_IFTYPE_STATION:/* This shouldn't happen */
return 0;
default:
@@ -1210,6 +1212,7 @@ mwifiex_cfg80211_change_virtual_intf(struct wiphy *wiphy,
case NL80211_IFTYPE_UNSPECIFIED:
mwifiex_dbg(priv->adapter, INFO,
"%s: kept type as AP\n", dev->name);
+   /* fall through */
case NL80211_IFTYPE_AP: /* This shouldn't happen */
return 0;
default:
@@ -1249,6 +1252,7 @@ mwifiex_cfg80211_change_virtual_intf(struct wiphy *wiphy,
case NL80211_IFTYPE_UNSPECIFIED:
mwifiex_dbg(priv->adapter, INFO,
"%s: kept type as P2P\n", dev->name);
+   /* fall through */
case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_P2P_GO:
return 0;
diff --git a/drivers/net/wireless/marvell/mwifiex/scan.c 
b/drivers/net/wireless/marvell/mwifiex/scan.c
index d7ce7f7..19df92b 100644
--- a/drivers/net/wireless/marvell/mwifiex/scan.c
+++ b/drivers/net/wireless/marvell/mwifiex/scan.c
@@ -1308,6 +1308,7 @@ int mwifiex_update_bss_desc_with_ie(struct 
mwifiex_adapter *adapter,
 
case WLAN_EID_CHANNEL_SWITCH:
bss_entry->chan_sw_ie_present = true;
+   /* fall through */
case WLAN_EID_PWR_CAPABILITY:
case WLAN_EID_TPC_REPORT:
case WLAN_EID_QUIET:
-- 
2.7.4



Re: [PATCH v2 net-next] tcp: use data length instead of skb->len in tcp_probe

2018-05-25 Thread Song Liu


> On May 25, 2018, at 3:14 AM, Yafang Shao  wrote:
> 
> skb->len is meaningless to user.
> data length could be more helpful, with which we can easily filter out
> the packet without payload.
> 
> Signed-off-by: Yafang Shao 

Acked-by: Song Liu 


> ---
> include/trace/events/tcp.h | 8 
> 1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index c1a5284..703abb6 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -236,7 +236,7 @@
>   __field(__u16, sport)
>   __field(__u16, dport)
>   __field(__u32, mark)
> - __field(__u16, length)
> + __field(__u16, data_len)
>   __field(__u32, snd_nxt)
>   __field(__u32, snd_una)
>   __field(__u32, snd_cwnd)
> @@ -261,7 +261,7 @@
>   __entry->dport = ntohs(inet->inet_dport);
>   __entry->mark = skb->mark;
> 
> - __entry->length = skb->len;
> + __entry->data_len = skb->len - tcp_hdrlen(skb);
>   __entry->snd_nxt = tp->snd_nxt;
>   __entry->snd_una = tp->snd_una;
>   __entry->snd_cwnd = tp->snd_cwnd;
> @@ -272,9 +272,9 @@
>   __entry->sock_cookie = sock_gen_cookie(sk);
>   ),
> 
> - TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x 
> snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u 
> sock_cookie=%llx",
> + TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x 
> snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u 
> sock_cookie=%llx",
> __entry->saddr, __entry->daddr, __entry->mark,
> -   __entry->length, __entry->snd_nxt, __entry->snd_una,
> +   __entry->data_len, __entry->snd_nxt, __entry->snd_una,
> __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
> __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie)
> );
> -- 
> 1.8.3.1
> 



Re: [PATCH bpf-next] libbpf: Install btf.h with libbpf

2018-05-25 Thread Song Liu
On Fri, May 25, 2018 at 10:33 AM, Martin KaFai Lau  wrote:
> On Fri, May 25, 2018 at 10:23:13AM -0700, Andrey Ignatov wrote:
>> install_headers target should contain all headers that are part of
>> libbpf. Add missing btf.h
>>
>> Signed-off-by: Andrey Ignatov 
> Acked-by: Martin KaFai Lau 

Acked-by: Song Liu 


Re: [PATCH, net-next 2/2] bpf: avoid -Wmaybe-uninitialized warning

2018-05-25 Thread Song Liu

> On May 25, 2018, at 2:33 PM, Arnd Bergmann  wrote:
> 
> The stack_map_get_build_id_offset() function is too long for gcc to track
> whether 'work' may or may not be initialized at the end of it, leading
> to a false-positive warning:
> 
> kernel/bpf/stackmap.c: In function 'stack_map_get_build_id_offset':
> kernel/bpf/stackmap.c:334:13: error: 'work' may be used uninitialized in this 
> function [-Werror=maybe-uninitialized]
> 
> This removes the 'in_nmi_ctx' flag and uses the state of that variable
> itself to see if it got initialized.
> 
> Fixes: bae77c5eb5b2 ("bpf: enable stackmap with build_id in nmi context")
> Signed-off-by: Arnd Bergmann 
> ---
> kernel/bpf/stackmap.c | 7 +++
> 1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index b59ace0f0f09..b675a3f3d141 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -285,11 +285,10 @@ static void stack_map_get_build_id_offset(struct 
> bpf_stack_build_id *id_offs,
> {
>   int i;
>   struct vm_area_struct *vma;
> - bool in_nmi_ctx = in_nmi();
>   bool irq_work_busy = false;
> - struct stack_map_irq_work *work;
> + struct stack_map_irq_work *work = NULL;
> 
> - if (in_nmi_ctx) {
> + if (in_nmi()) {
>   work = this_cpu_ptr(&up_read_work);
>   if (work->irq_work.flags & IRQ_WORK_BUSY)
>   /* cannot queue more up_read, fallback */
> @@ -328,7 +327,7 @@ static void stack_map_get_build_id_offset(struct 
> bpf_stack_build_id *id_offs,
>   id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
>   }
> 
> - if (!in_nmi_ctx) {
> + if (!work) {
>   up_read(&current->mm->mmap_sem);
>   } else {
>   work->sem = &current->mm->mmap_sem;
> -- 
> 2.9.0
> 

Acked-by: Song Liu 

Re: [PATCH, net-next 1/2] bpf: btf: avoid -Wreturn-type warning

2018-05-25 Thread Song Liu


> On May 25, 2018, at 2:33 PM, Arnd Bergmann  wrote:
> 
> gcc warns about a noreturn function possibly returning in
> some configurations:
> 
> kernel/bpf/btf.c: In function 'env_type_is_resolve_sink':
> kernel/bpf/btf.c:729:1: error: control reaches end of non-void function 
> [-Werror=return-type]
> 
> Using BUG() instead of BUG_ON() avoids that warning and otherwise
> does the exact same thing.
> 
> Fixes: eb3f595dab40 ("bpf: btf: Validate type reference")
> Signed-off-by: Arnd Bergmann 
> ---
> kernel/bpf/btf.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 9cbeabb5aca3..2822a0cf4f48 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -749,7 +749,7 @@ static bool env_type_is_resolve_sink(const struct 
> btf_verifier_env *env,
>   !btf_type_is_array(next_type) &&
>   !btf_type_is_struct(next_type);
>   default:
> - BUG_ON(1);
> + BUG();
>   }
> }
> 
> -- 
> 2.9.0
> 

Acked-by: Song Liu 




Re: [PATCH] PCI: allow drivers to limit the number of VFs to 0

2018-05-25 Thread Bjorn Helgaas
On Fri, May 25, 2018 at 02:05:21PM -0700, Jakub Kicinski wrote:
> On Fri, 25 May 2018 09:02:23 -0500, Bjorn Helgaas wrote:
> > On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
> > > On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:  
> > > > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:  
> > > > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > > > to not fail, e.g.:
> > > > > 
> > > > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > > > 
> > > > > For devices which VF support depends on loaded FW we have the
> > > > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > > > to 0.  Remove the special values completely and simply initialize
> > > > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > > > Add a helper for drivers to reset the VF limit back to total.
> > > > 
> > > > I still can't really make sense out of the changelog.
> > > >
> > > > I think part of the reason it's confusing is because there are two
> > > > things going on:
> > > > 
> > > >   1) You want this:
> > > >   
> > > >pci_sriov_set_totalvfs(dev, 0);
> > > >x = pci_sriov_get_totalvfs(dev) 
> > > > 
> > > >  to return 0 instead of total_VFs.  That seems to connect with
> > > >  your subject line.  It means "sriov_totalvfs" in sysfs could be
> > > >  0, but I don't know how that is useful (I'm sure it is; just
> > > >  educate me :))  
> > > 
> > > Let me just quote the bug report that got filed on our internal bug
> > > tracker :)
> > > 
> > >   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
> > >   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
> > >   then tries to set that as the sriov_numvfs parameter.
> > > 
> > >   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0, 
> > >   but it's set to max.  When FW is switched to flower*, the correct 
> > >   sriov_totalvfs value is presented.
> > > 
> > > * flower is a project name  
> > 
> > From the point of view of the PCI core (which knows nothing about
> > device firmware and relies on the architected config space described
> > by the PCIe spec), this sounds like an erratum: with some firmware
> > installed, the device is not capable of SR-IOV, but still advertises
> > an SR-IOV capability with "TotalVFs > 0".
> > 
> > Regardless of whether that's an erratum, we do allow PF drivers to use
> > pci_sriov_set_totalvfs() to limit the number of VFs that may be
> > enabled by writing to the PF's "sriov_numvfs" sysfs file.
> 
> Think more of an FPGA which can be reprogrammed at runtime to have
> different capabilities than an erratum.  Some FWs simply have no use
> for VFs and save resources (and validation time) by not supporting it.

This is a bit of a gray area.  Reloading firmware or reprogramming an
FPGA has the potential to create a new and different device than we
had before, but the PCI core doesn't know that.  The typical sequence
is:

  - PCI core enumerates device
  - driver binds to device (we call .probe())
  - driver loads new firmware to device
  - driver resets device with pci_reset_function() or similar
  - pci_reset_function() saves config space
  - pci_reset_function() resets device
  - device uses new firmware when it comes out of reset
  - pci_reset_function() restores config space

Loading the new firmware might change what the device looks like in
config space -- it could change the number or size of BARs, the
capabilities advertised, etc.  We currently sweep that under the rug
and blindly restore the old config space.

It looks like your driver does the reset differently, so maybe it
keeps the original config space setup.

But all that said, I agree that we should allow a PF driver to prevent
VF enablement, whether because the firmware doesn't support it or the
PF driver just wants to prevent use of VFs for whatever reason (maybe
we don't have enough MMIO resources, we don't need the VFs, etc.)

> Okay, perfect.  That makes sense.  The patch below certainly fixes the
> first issue for us.  Thank you!
> 
> As far as the second issue goes - agreed, having the core reset the
> number of VFs to total_VFs definitely makes sense.  It doesn't cater to
> the case where FW is reloaded without reprobing, but we don't do this
> today anyway.
> 
> Should I try to come up with a patch to reset total_VFs after detach?

Yes, please.

Bjorn


[RFC PATCH 2/2] net: macb: Disable TX checksum offloading on all Zynq

2018-05-25 Thread Jennifer Dahm
The Zynq ethernet hardware has checksum offloading bugs that cause
small UDP packets (<= 2 bytes) to be sent with an incorrect checksum
(0xFFFF) and forwarded UDP packets to be re-checksummed, which is
illegal behavior. The best solution we have right now is to disable
hardware TX checksum offloading entirely.

Signed-off-by: Jennifer Dahm 
---
 drivers/net/ethernet/cadence/macb_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index a5d564b..e8cc68a 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -3807,7 +3807,8 @@ static const struct macb_config zynqmp_config = {
 };
 
 static const struct macb_config zynq_config = {
-   .caps = MACB_CAPS_GIGABIT_MODE_AVAILABLE | MACB_CAPS_NO_GIGABIT_HALF,
+   .caps = MACB_CAPS_GIGABIT_MODE_AVAILABLE | MACB_CAPS_NO_GIGABIT_HALF
+ | MACB_CAPS_DISABLE_TX_HW_CSUM,
.dma_burst_length = 16,
.clk_init = macb_clk_init,
.init = macb_init,
-- 
2.7.4



[RFC PATCH 1/2] net: macb: Add CAP to disable hardware TX checksum offloading

2018-05-25 Thread Jennifer Dahm
Certain PHYs have significant bugs in their TX checksum offloading
that cannot be solved in software. In order to accommodate these PHYS,
add a CAP to disable this hardware.

Signed-off-by: Jennifer Dahm 
---
 drivers/net/ethernet/cadence/macb.h  | 1 +
 drivers/net/ethernet/cadence/macb_main.c | 8 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.h 
b/drivers/net/ethernet/cadence/macb.h
index 8665982..6b85e97 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -635,6 +635,7 @@
 #define MACB_CAPS_USRIO_DISABLED   0x0010
 #define MACB_CAPS_JUMBO0x0020
 #define MACB_CAPS_GEM_HAS_PTP  0x0040
+#define MACB_CAPS_DISABLE_TX_HW_CSUM   0x0080
 #define MACB_CAPS_FIFO_MODE0x1000
 #define MACB_CAPS_GIGABIT_MODE_AVAILABLE   0x2000
 #define MACB_CAPS_SG_DISABLED  0x4000
diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index 3e93df5..a5d564b 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -3360,8 +3360,12 @@ static int macb_init(struct platform_device *pdev)
dev->hw_features |= MACB_NETIF_LSO;
 
/* Checksum offload is only available on gem with packet buffer */
-   if (macb_is_gem(bp) && !(bp->caps & MACB_CAPS_FIFO_MODE))
-   dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
+   if (macb_is_gem(bp) && !(bp->caps & MACB_CAPS_FIFO_MODE)) {
+   if (!(bp->caps & MACB_CAPS_DISABLE_TX_HW_CSUM))
+   dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
+   else
+   dev->hw_features |= NETIF_F_RXCSUM;
+   }
if (bp->caps & MACB_CAPS_SG_DISABLED)
dev->hw_features &= ~NETIF_F_SG;
dev->features = dev->hw_features;
-- 
2.7.4



[RFC PATCH 0/2] net: macb: Disable TX checksum offloading on all Zynq

2018-05-25 Thread Jennifer Dahm
During testing, I discovered that the Zynq GEM hardware overwrites all
outgoing UDP packet checksums, which is illegal in packet forwarding
cases. This happens both with and without the checksum-zeroing
behavior  introduced  in  007e4ba3ee137f4700f39aa6dbaf01a71047c5f6
("net: macb: initialize checksum when using checksum offloading"). The
only solution to both the small packet bug and the packet forwarding
bug that I can find is to disable TX checksum offloading entirely.

There's still the possibility that these bugs are actually with the
driver software and not with the hardware. I've found several places
where the checksum is set to 0xFFFF (the incorrect checksum found in
small packets) when something goes wrong, and I can imagine a buggy
driver writing over the checksum blindly when TX checksum offloading
is enabled.

I would like feedback on two things:
1. Is it possible that the two bugs described above are caused by the
   driver and not by the hardware? If so, where should I look to
   implicate the driver?
2. Is this a problem we care enough about to completely disable TX
   checksum offloading?

Here is the testing procedure I used to reproduce these bugs on my
machine. Specifically, without this patchset, step 9 fails. Without
007e4ba3ee, step 8 also fails.

1. Set up the test environment:
  a. Acquire a Zynq device with two ethernet ports. This is the DUT.
  b. Acquire a USB-Ethernet adapter.
  c. Acquire two ethernet cables.
  d. Connect one Ethernet port on the DUT to your computer's network
 switch.
  e. Connect the other Ethernet port to the USB-Ethernet adapter and
 plug that adapter into your computer.
  f. Set up a Linux VM to send packets through the DUT. I recommend
 using a VM here so that you can easily detach it from the primary
 network to force outgoing traffic through the DUT.
  g. Set up a computer with a packet inspecting program to receive and
 inspect packets. This doesn't need to be a VM. For the purposes
 of this test, I'll be using a Windows instance with WireShark.
2. Load the kernel you want to test onto the DUT, making sure to
   include the `bridge` module.
3. Set up a bridge on the DUT. The following commands on the DUT
   should work, replacing `eth0` and `eth1` with the two ethernet
   interfaces on the DUT:
   ```
   brctl addbr test
   brctl addif test eth0 eth1
   ifconfig eth0 0.0.0.0
   ifconfig eth1 0.0.0.0
   dhclient test -v
   ```
4. Disconnect the Linux VM from your host computer's network and
   connect it to the USB-Ethernet adapter in order to force outgoing
   network traffic through the DUT. If necessary, run dhclient on the
   Linux VM to acquire an IP address.
5. Ensure that you can reach your Windows instance from your Linux VM
   through the DUT (e.g. ping).
6. Start WireShark on your Windows instance and start monitoring
   traffic on a specific, unused port (e.g. 61557).
7. Using netcat, send a few not-tiny UDP packets from your Linux VM to
   your Windows instance to ensure that valid UDP packets are properly
   forwarded. Ex:
   ```
   echo "hello world" | netcat -u  61557
   ```
   Inspect these packets to ensure that the data arrived intact and
   that the checksum looks reasonable (i.e. not 0x0000 or 0xFFFF).
8. Using netcat, send a few tiny UDP packets (2 bytes or fewer) from
   Linux VM to your Windows instance to ensure that the checksum is
   reasonable. Ex:
   ```
   echo "h" | netcat -u  61557
   ```
9. Using a custom program, send UDP packets with broken checksums
   (e.g. 0xABCD) from your Linux VM to your Windows instance. Inspect
   these packets with WireShark and make sure that the packet arrived
   with the same checksum you sent it with.

For step 9, I wrote a C program using the Linux socket API that will
send a properly formatted UDP packet with the payload "Hello!" and a
(broken) checksum of 0xABCD to port 61557 on the host provided at the
command line. I can send the full program if you would like, but here
is the important part of it:
```
/*
 * Wire layout of a hand-built UDP datagram: the standard 8-byte UDP
 * header (source port, destination port, length, checksum) followed by
 * a flexible-array payload.  Header fields are written in network byte
 * order before being sent.
 */
struct custom_udp {
    int16_t s_port;
    int16_t d_port;
    int16_t length;
    int16_t check;
    char data[];
};

/*
 * Build a UDP datagram carrying @message with a deliberately broken
 * checksum (0xABCD) and write it to @sockfd.
 *
 * @sockfd:  descriptor the raw frame is written to
 * @port:    destination port, host byte order (converted with htons)
 * @message: NUL-terminated payload; the terminator is not sent
 *
 * Returns the value of write() (bytes written) on success, or -1 on
 * allocation failure or a failed write().
 */
int send_message(int sockfd, in_port_t port, const char *message) {
    struct custom_udp *frame;
    int16_t message_len;
    int16_t frame_len;
    int ret;

    message_len = strlen(message) * sizeof(char);
    /* sizeof(struct custom_udp) is the 8-byte header; payload follows. */
    frame_len = sizeof(struct custom_udp) + message_len;
    frame = malloc(frame_len);
    if (!frame)
        return -1; /* original code dereferenced NULL on allocation failure */

    frame->s_port = htons(0);
    frame->d_port = htons(port);
    frame->length = htons(frame_len);
    frame->check = htons(0xABCD); /* intentionally invalid checksum */
    memmove(frame->data, message, message_len);

    ret = write(sockfd, frame, frame_len);
    free(frame);

    return ret;
}
```

Jennifer Dahm (1):
  net/macb: Disable TX checksum offloading on all Zynq-7000

 drivers/net/ethernet/cadence/macb.h  |  1 +
 drivers/net/ethernet/cadence/macb_main.c | 11 ---
 2 files changed, 9 

[PATCH] ath9k: mark expected switch fall-throughs

2018-05-25 Thread Gustavo A. R. Silva
In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/net/wireless/ath/ath9k/ar5008_phy.c | 2 ++
 drivers/net/wireless/ath/ath9k/ar9002_phy.c | 1 +
 drivers/net/wireless/ath/ath9k/main.c   | 1 +
 3 files changed, 4 insertions(+)

diff --git a/drivers/net/wireless/ath/ath9k/ar5008_phy.c 
b/drivers/net/wireless/ath/ath9k/ar5008_phy.c
index 7922550..ef2dd68 100644
--- a/drivers/net/wireless/ath/ath9k/ar5008_phy.c
+++ b/drivers/net/wireless/ath/ath9k/ar5008_phy.c
@@ -583,12 +583,14 @@ static void ar5008_hw_init_chain_masks(struct ath_hw *ah)
case 0x5:
REG_SET_BIT(ah, AR_PHY_ANALOG_SWAP,
AR_PHY_SWAP_ALT_CHAIN);
+   /* fall through */
case 0x3:
if (ah->hw_version.macVersion == AR_SREV_REVISION_5416_10) {
REG_WRITE(ah, AR_PHY_RX_CHAINMASK, 0x7);
REG_WRITE(ah, AR_PHY_CAL_CHAINMASK, 0x7);
break;
}
+   /* else: fall through */
case 0x1:
case 0x2:
case 0x7:
diff --git a/drivers/net/wireless/ath/ath9k/ar9002_phy.c 
b/drivers/net/wireless/ath/ath9k/ar9002_phy.c
index 61a9b85..7132918 100644
--- a/drivers/net/wireless/ath/ath9k/ar9002_phy.c
+++ b/drivers/net/wireless/ath/ath9k/ar9002_phy.c
@@ -119,6 +119,7 @@ static int ar9002_hw_set_channel(struct ath_hw *ah, struct 
ath9k_channel *chan)
aModeRefSel = 2;
if (aModeRefSel)
break;
+   /* else: fall through */
case 1:
default:
aModeRefSel = 0;
diff --git a/drivers/net/wireless/ath/ath9k/main.c 
b/drivers/net/wireless/ath/ath9k/main.c
index a3be8ad..11d84f4 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1928,6 +1928,7 @@ static int ath9k_ampdu_action(struct ieee80211_hw *hw,
case IEEE80211_AMPDU_TX_STOP_FLUSH:
case IEEE80211_AMPDU_TX_STOP_FLUSH_CONT:
flush = true;
+   /* fall through */
case IEEE80211_AMPDU_TX_STOP_CONT:
ath9k_ps_wakeup(sc);
ath_tx_aggr_stop(sc, sta, tid);
-- 
2.7.4



Re: [PATCH 00/14] Modify action API for implementing lockless actions

2018-05-25 Thread Cong Wang
On Fri, May 25, 2018 at 1:39 PM, Vlad Buslov  wrote:
>
> On Thu 24 May 2018 at 23:34, Cong Wang  wrote:
>> On Mon, May 14, 2018 at 7:27 AM, Vlad Buslov  wrote:
>>> Currently, all netlink protocol handlers for updating rules, actions and
>>> qdiscs are protected with single global rtnl lock which removes any
>>> possibility for parallelism. This patch set is a first step to remove
>>> rtnl lock dependency from TC rules update path. It updates act API to
>>> use atomic operations, rcu and spinlocks for fine-grained locking. It
>>> also extend API with functions that are needed to update existing
>>> actions for parallel execution.
>>
>> Can you give a summary here for what and how it is achieved?
>
> Got it, will expand cover letter in V2 with summary.
>>
>> You said this is the first step, what do you want to achieve in this
>> very first step? And how do you achieve it? Do you break the RTNL
>
> But aren't this questions answered in paragraph you quoted?


Obviously not, you said to remove it, but never explains why it can
be removed and how it is removed. This is crucial for review.

"use atomic operations, rcu and spinlocks for fine-grained locking"
is literately nothing, why atomic/rcu makes RTNL unnecessary?
How RCU is used? What spinlocks are you talking about? What
do these spinlocks protect after removing RTNL? Why are they
safe with other netdevice and netns operations?

You explain _nothing_ here. Really. Please don't force people to
read 14 patches to understand how it works. In fact, no one wants
to read the code unless there is some high-level explanation that
makes basic sense.


> What: Change act API to not rely on one-big-global-RTNL-lock and to use
> more fine-grained synchronization methods to allow safe concurrent
> execution.

Sure, how fine-grained it is after your patchset? Why this fine-grained
lock could safely replace RTNL?

Could you stop letting us guess your puzzle words? It would save your
time from exchanging emails with me, it would save my time from
guessing you too. It is a win-win.


> How: Refactor act API code to use atomics, rcu and spinlocks, etc. for
> protecting shared data structures, add new functions required to update


What shared data structures? The per-netns idr which is already protected
by a spinlock? The TC hierarchy? The shared standalone actions? Hey,
why do I have to guess? :-/


> specific actions implementation for parallel execution. (step 2)


Claim is easy, prove is hard. I can easily claim I break RTNL down
to a per-netns lock, but I can't prove it really works. :-D


>
> If you feel that this cover letter is too terse, I will add outline of
> changes in V2.

It is not my rule, it is how you have to help people to review your
14 patches. I think it is a fair game: you help people like me to
review your patches, we help you to get them reviewed and merged
if they all make sense.



>
>> lock down to, for a quick example, a per-device lock? Or perhaps you
>> completely remove it because of what reason?
>
> I want to remove RTNL _dependency_ from act API data structures and
> code. I probably should me more specific in this case:
>
> Florian recently made a change that allows registering netlink protocol
> handlers with flag RTNL_FLAG_DOIT_UNLOCKED. Handlers registered with
> this flag are called without RTNL taken. My end goal is to have rule
> update handlers(RTM_NEWTFILTER, RTM_DELTFILTER, etc.) to be registered
> with UNLOCKED flag to allow parallel execution.


Please add this paragraph in your cover letter, it is very important for review.

>
> I do not intend to globally remove or break RTNL.
>
>>
>> I go through all the descriptions of your 14 patches (but not any code),
>> I still have no clue how you successfully avoid RTNL. Please don't
>> let me read into your code to understand that, there must be some
>> high-level justification on how it works. Without it, I don't event want
>> to read into the code.
>
> On internal code review I've been asked not to duplicate info from
> commit messages in cover letter, but I guess I can expand it with some
> high level outline in V2.

In cover letter, you should put a high-level overview of "why" and "how".
If, in the worst case, on high-level it doesn't make sense, why should
we bother to read the code? In short, you have to convince people to
read your code here.

In each patch description, you should explain what a single patch does.
I don't see any duplication.


Re: [PATCH] IB: Revert "remove redundant INFINIBAND kconfig dependencies"

2018-05-25 Thread Leon Romanovsky
On Fri, May 25, 2018 at 11:29:59PM +0200, Arnd Bergmann wrote:
> Several subsystems depend on INFINIBAND_ADDR_TRANS, which in turn depends
> on INFINIBAND. However, when with CONFIG_INIFIBAND=m, this leads to a
> link error when another driver using it is built-in. The
> INFINIBAND_ADDR_TRANS dependency is insufficient here as this is
> a 'bool' symbol that does not force anything to be a module in turn.
>
> fs/cifs/smbdirect.o: In function `smbd_disconnect_rdma_work':
> smbdirect.c:(.text+0x1e4): undefined reference to `rdma_disconnect'
> net/9p/trans_rdma.o: In function `rdma_request':
> trans_rdma.c:(.text+0x7bc): undefined reference to `rdma_disconnect'
> net/9p/trans_rdma.o: In function `rdma_destroy_trans':
> trans_rdma.c:(.text+0x830): undefined reference to `ib_destroy_qp'
> trans_rdma.c:(.text+0x858): undefined reference to `ib_dealloc_pd'
>
> Fixes: 9533b292a7ac ("IB: remove redundant INFINIBAND kconfig dependencies")
> Signed-off-by: Arnd Bergmann 
> ---
> The patch that introduced the problem has been queued in the
> rdma-fixes/for-rc tree. Please revert the patch before sending
> the branch to Linus.
> ---

It was already sent to Linus.

https://marc.info/?l=linux-rdma&m=152719509803047&w=2

Thanks


signature.asc
Description: PGP signature


[PATCH, net-next] net/mlx5e: fix TLS dependency

2018-05-25 Thread Arnd Bergmann
With CONFIG_TLS=m and MLX5_CORE_EN=y, we get a link failure:

drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.o: In function 
`mlx5e_tls_handle_ooo':
tls_rxtx.c:(.text+0x24c): undefined reference to `tls_get_record'
drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.o: In function 
`mlx5e_tls_handle_tx_skb':
tls_rxtx.c:(.text+0x9a8): undefined reference to `tls_device_sk_destruct'

This narrows down the dependency to only allow the configurations
that will actually work. The existing dependency on TLS_DEVICE is
not sufficient here since MLX5_EN_TLS is a 'bool' symbol.

Fixes: c83294b9efa5 ("net/mlx5e: TLS, Add Innova TLS TX support")
Signed-off-by: Arnd Bergmann 
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig 
b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index ee6684779d11..2545296a0c08 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -91,6 +91,7 @@ config MLX5_EN_TLS
bool "TLS cryptography-offload accelaration"
depends on MLX5_CORE_EN
depends on TLS_DEVICE
+   depends on TLS=y || MLX5_CORE=m
depends on MLX5_ACCEL
default n
---help---
-- 
2.9.0



[PATCH, net-next] qcom-emag: hide ACPI specific functions

2018-05-25 Thread Arnd Bergmann
A couple of functions in this file are only used when building with
ACPI enabled, leading to a build warning on most architectures:

drivers/net/ethernet/qualcomm/emac/emac-sgmii.c:284:25: error: 'qdf2400_ops' 
defined but not used [-Werror=unused-variable]
 static struct sgmii_ops qdf2400_ops = {
 ^~~
drivers/net/ethernet/qualcomm/emac/emac-sgmii.c:276:25: error: 'qdf2432_ops' 
defined but not used [-Werror=unused-variable]
 static struct sgmii_ops qdf2432_ops = {

This hides all the unused functions by putting them into the
corresponding #ifdef.

Signed-off-by: Arnd Bergmann 
---
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c 
b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
index 562420b834df..01b80e0a5367 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
@@ -108,6 +108,7 @@ static void emac_sgmii_link_init(struct emac_adapter *adpt)
writel(val, phy->base + EMAC_SGMII_PHY_AUTONEG_CFG2);
 }
 
+#ifdef CONFIG_ACPI
 static int emac_sgmii_irq_clear(struct emac_adapter *adpt, u8 irq_bits)
 {
	struct emac_sgmii *phy = &adpt->phy;
@@ -288,6 +289,7 @@ static struct sgmii_ops qdf2400_ops = {
.link_change = emac_sgmii_common_link_change,
.reset = emac_sgmii_common_reset,
 };
+#endif
 
 static int emac_sgmii_acpi_match(struct device *dev, void *data)
 {
-- 
2.9.0



[PATCH, net-next 1/2] bpf: btf: avoid -Wreturn-type warning

2018-05-25 Thread Arnd Bergmann
gcc warns about a noreturn function possibly returning in
some configurations:

kernel/bpf/btf.c: In function 'env_type_is_resolve_sink':
kernel/bpf/btf.c:729:1: error: control reaches end of non-void function 
[-Werror=return-type]

Using BUG() instead of BUG_ON() avoids that warning and otherwise
does the exact same thing.

Fixes: eb3f595dab40 ("bpf: btf: Validate type reference")
Signed-off-by: Arnd Bergmann 
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9cbeabb5aca3..2822a0cf4f48 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -749,7 +749,7 @@ static bool env_type_is_resolve_sink(const struct 
btf_verifier_env *env,
!btf_type_is_array(next_type) &&
!btf_type_is_struct(next_type);
default:
-   BUG_ON(1);
+   BUG();
}
 }
 
-- 
2.9.0



[PATCH, net-next 2/2] bpf: avoid -Wmaybe-uninitialized warning

2018-05-25 Thread Arnd Bergmann
The stack_map_get_build_id_offset() function is too long for gcc to track
whether 'work' may or may not be initialized at the end of it, leading
to a false-positive warning:

kernel/bpf/stackmap.c: In function 'stack_map_get_build_id_offset':
kernel/bpf/stackmap.c:334:13: error: 'work' may be used uninitialized in this 
function [-Werror=maybe-uninitialized]

This removes the 'in_nmi_ctx' flag and uses the state of that variable
itself to see if it got initialized.

Fixes: bae77c5eb5b2 ("bpf: enable stackmap with build_id in nmi context")
Signed-off-by: Arnd Bergmann 
---
 kernel/bpf/stackmap.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b59ace0f0f09..b675a3f3d141 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -285,11 +285,10 @@ static void stack_map_get_build_id_offset(struct 
bpf_stack_build_id *id_offs,
 {
int i;
struct vm_area_struct *vma;
-   bool in_nmi_ctx = in_nmi();
bool irq_work_busy = false;
-   struct stack_map_irq_work *work;
+   struct stack_map_irq_work *work = NULL;
 
-   if (in_nmi_ctx) {
+   if (in_nmi()) {
		work = this_cpu_ptr(&up_read_work);
if (work->irq_work.flags & IRQ_WORK_BUSY)
/* cannot queue more up_read, fallback */
@@ -328,7 +327,7 @@ static void stack_map_get_build_id_offset(struct 
bpf_stack_build_id *id_offs,
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
}
 
-   if (!in_nmi_ctx) {
+   if (!work) {
up_read(>mm->mmap_sem);
} else {
work->sem = >mm->mmap_sem;
-- 
2.9.0



[PATCH] IB: Revert "remove redundant INFINIBAND kconfig dependencies"

2018-05-25 Thread Arnd Bergmann
Several subsystems depend on INFINIBAND_ADDR_TRANS, which in turn depends
on INFINIBAND. However, with CONFIG_INFINIBAND=m, this leads to a
link error when another driver using it is built-in. The
INFINIBAND_ADDR_TRANS dependency is insufficient here as this is
a 'bool' symbol that does not force anything to be a module in turn.

fs/cifs/smbdirect.o: In function `smbd_disconnect_rdma_work':
smbdirect.c:(.text+0x1e4): undefined reference to `rdma_disconnect'
net/9p/trans_rdma.o: In function `rdma_request':
trans_rdma.c:(.text+0x7bc): undefined reference to `rdma_disconnect'
net/9p/trans_rdma.o: In function `rdma_destroy_trans':
trans_rdma.c:(.text+0x830): undefined reference to `ib_destroy_qp'
trans_rdma.c:(.text+0x858): undefined reference to `ib_dealloc_pd'

Fixes: 9533b292a7ac ("IB: remove redundant INFINIBAND kconfig dependencies")
Signed-off-by: Arnd Bergmann 
---
The patch that introduced the problem has been queued in the
rdma-fixes/for-rc tree. Please revert the patch before sending
the branch to Linus.
---
 drivers/infiniband/ulp/srpt/Kconfig | 2 +-
 drivers/nvme/host/Kconfig   | 2 +-
 drivers/nvme/target/Kconfig | 2 +-
 drivers/staging/lustre/lnet/Kconfig | 2 +-
 fs/cifs/Kconfig | 2 +-
 net/9p/Kconfig  | 2 +-
 net/rds/Kconfig | 2 +-
 net/sunrpc/Kconfig  | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/ulp/srpt/Kconfig 
b/drivers/infiniband/ulp/srpt/Kconfig
index 25bf6955b6d0..fb8b7182f05e 100644
--- a/drivers/infiniband/ulp/srpt/Kconfig
+++ b/drivers/infiniband/ulp/srpt/Kconfig
@@ -1,6 +1,6 @@
 config INFINIBAND_SRPT
tristate "InfiniBand SCSI RDMA Protocol target support"
-   depends on INFINIBAND_ADDR_TRANS && TARGET_CORE
+   depends on INFINIBAND && INFINIBAND_ADDR_TRANS && TARGET_CORE
---help---
 
  Support for the SCSI RDMA Protocol (SRP) Target driver. The
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index dbb7464c018c..88a8b5916624 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -27,7 +27,7 @@ config NVME_FABRICS
 
 config NVME_RDMA
tristate "NVM Express over Fabrics RDMA host driver"
-   depends on INFINIBAND_ADDR_TRANS && BLOCK
+   depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK
select NVME_CORE
select NVME_FABRICS
select SG_POOL
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 7595664ee753..3c7b61ddb0d1 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -27,7 +27,7 @@ config NVME_TARGET_LOOP
 
 config NVME_TARGET_RDMA
tristate "NVMe over Fabrics RDMA target support"
-   depends on INFINIBAND_ADDR_TRANS
+   depends on INFINIBAND && INFINIBAND_ADDR_TRANS
depends on NVME_TARGET
select SGL_ALLOC
help
diff --git a/drivers/staging/lustre/lnet/Kconfig 
b/drivers/staging/lustre/lnet/Kconfig
index f3b1ad4bd3dc..ad049e6f24e4 100644
--- a/drivers/staging/lustre/lnet/Kconfig
+++ b/drivers/staging/lustre/lnet/Kconfig
@@ -34,7 +34,7 @@ config LNET_SELFTEST
 
 config LNET_XPRT_IB
tristate "LNET infiniband support"
-   depends on LNET && PCI && INFINIBAND_ADDR_TRANS
+   depends on LNET && PCI && INFINIBAND && INFINIBAND_ADDR_TRANS
default LNET && INFINIBAND
help
  This option allows the LNET users to use infiniband as an
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index d61e2de8d0eb..5f132d59dfc2 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -197,7 +197,7 @@ config CIFS_SMB311
 
 config CIFS_SMB_DIRECT
bool "SMB Direct support (Experimental)"
-   depends on CIFS=m && INFINIBAND_ADDR_TRANS || CIFS=y && 
INFINIBAND_ADDR_TRANS=y
+   depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && 
INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
help
  Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1.
  SMB Direct allows transferring SMB packets over RDMA. If unsure,
diff --git a/net/9p/Kconfig b/net/9p/Kconfig
index 46c39f7da444..e6014e0e51f7 100644
--- a/net/9p/Kconfig
+++ b/net/9p/Kconfig
@@ -32,7 +32,7 @@ config NET_9P_XEN
 
 
 config NET_9P_RDMA
-   depends on INET && INFINIBAND_ADDR_TRANS
+   depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS
tristate "9P RDMA Transport (Experimental)"
help
  This builds support for an RDMA transport.
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index 1a31502ee7db..bffde4b46c5d 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -8,7 +8,7 @@ config RDS
 
 config RDS_RDMA
tristate "RDS over Infiniband"
-   depends on RDS && INFINIBAND_ADDR_TRANS
+   depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
---help---
  Allow RDS to use Infiniband as a transport.
  This transport supports RDMA operations.
diff --git 

Re: [PATCH] PCI: allow drivers to limit the number of VFs to 0

2018-05-25 Thread Jakub Kicinski
On Fri, 25 May 2018 09:02:23 -0500, Bjorn Helgaas wrote:
> On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
> > On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:  
> > > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:  
> > > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > > to not fail, e.g.:
> > > > 
> > > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > > 
> > > > For devices which VF support depends on loaded FW we have the
> > > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > > to 0.  Remove the special values completely and simply initialize
> > > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > > Add a helper for drivers to reset the VF limit back to total.
> > > 
> > > I still can't really make sense out of the changelog.
> > >
> > > I think part of the reason it's confusing is because there are two
> > > things going on:
> > > 
> > >   1) You want this:
> > >   
> > >pci_sriov_set_totalvfs(dev, 0);
> > >x = pci_sriov_get_totalvfs(dev) 
> > > 
> > >  to return 0 instead of total_VFs.  That seems to connect with
> > >  your subject line.  It means "sriov_totalvfs" in sysfs could be
> > >  0, but I don't know how that is useful (I'm sure it is; just
> > >  educate me :))  
> > 
> > Let me just quote the bug report that got filed on our internal bug
> > tracker :)
> > 
> >   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
> >   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
> >   then tries to set that as the sriov_numvfs parameter.
> > 
> >   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0, 
> >   but it's set to max.  When FW is switched to flower*, the correct 
> >   sriov_totalvfs value is presented.
> > 
> > * flower is a project name  
> 
> From the point of view of the PCI core (which knows nothing about
> device firmware and relies on the architected config space described
> by the PCIe spec), this sounds like an erratum: with some firmware
> installed, the device is not capable of SR-IOV, but still advertises
> an SR-IOV capability with "TotalVFs > 0".
> 
> Regardless of whether that's an erratum, we do allow PF drivers to use
> pci_sriov_set_totalvfs() to limit the number of VFs that may be
> enabled by writing to the PF's "sriov_numvfs" sysfs file.

Think more of an FPGA which can be reprogrammed at runtime to have
different capabilities than an erratum.  Some FWs simply have no use
for VFs and save resources (and validation time) by not supporting it.

> But the current implementation does not allow a PF driver to limit VFs
> to 0, and that does seem nonsensical.
> 
> > My understanding is OpenStack uses sriov_totalvfs to determine how many
> > VFs can be enabled, looks like this is the code:
> > 
> > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
> >   
> > >   2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
> > >  sure what you intend for this.  Is *every* driver supposed to
> > >  call it in .remove()?  Could/should this be done in the core
> > >  somehow instead of depending on every driver?  
> > 
> > Good question, I was just thinking yesterday we may want to call it
> > from the core, but I don't think it's strictly necessary nor always
> > sufficient (we may reload FW without re-probing).
> > 
> > We have a device which supports different number of VFs based on the FW
> > loaded.  Some legacy FWs does not inform the driver how many VFs it can
> > support, because it supports max.  So the flow in our driver is this:
> > 
> > load_fw(dev);
> > ...
> > max_vfs = ask_fw_for_max_vfs(dev);
> > if (max_vfs >= 0)
> > return pci_sriov_set_totalvfs(dev, max_vfs);
> > else /* FW didn't tell us, assume max */
> > return pci_sriov_reset_totalvfs(dev); 
> > 
> > We also reset the max on device remove, but that's not strictly
> > necessary.
> > 
> > Other users of pci_sriov_set_totalvfs() always know the value to set
> > the total to (either always get it from FW or it's a constant).
> > 
> > If you prefer we can work out the correct max for those legacy cases in
> > the driver as well, although it seemed cleaner to just ask the core,
> > since it already has total_VFs value handy :)
> >   
> > > I'm also having a hard time connecting your user-space command example
> > > with the rest of this.  Maybe it will make more sense to me tomorrow
> > > after some coffee.  
> > 
> > OpenStack assumes it will always be able to set sriov_numvfs to
> > sriov_totalvfs, see this 'if':
> > 
> > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512
> >   
> 
> Thanks for educating me.  I think there are two issues here that we
> can separate.  I extracted the patch below for the first.
> 

Inefficient call to ipv6_chk_acast_addr_src in icmp6_send

2018-05-25 Thread Salam Noureddine
Hi,

The call to ipv6_chk_acast_addr_src in icmp6_send can be pretty costly on
systems with a lot of net_devices since it can end up looping through all
net_devices in a net namespace searching for an anycast address. A few
thousand icmp6 error packets can end up consuming a whole CPU.
I am thinking of fixing this by adding a hash table along the lines of
inet6_addr_lst,
providing a fast lookup for anycast addresses. Is that the right way to go?

Thanks,

Salam


[GIT] Networking

2018-05-25 Thread David Miller

Let's begin the holiday weekend with some networking fixes:

1) Whoops need to restrict cfg80211 wiphy names even more to 64
   bytes.  From Eric Biggers.

2) Fix flags being ignored when using kernel_connect() with SCTP, from
   Xin Long.

3) Use after free in DCCP, from Alexey Kodanev.

4) Need to check rhltable_init() return value in ipmr code, from
   Eric Dumazet.

5) XDP handling fixes in virtio_net from Jason Wang.

6) Missing RTA_TABLE in rtm_ipv4_policy[], from Roopa Prabhu.

7) Need to use IRQ disabling spinlocks in mlx4_qp_lookup(), from Jack
   Morgenstein.

8) Prevent out-of-bounds speculation using indexes in BPF, from Daniel
   Borkmann.

9) Fix regression added by AF_PACKET link layer cure, from Willem
   de Bruijn.

10) Correct ENIC dma mask, from Govindarajulu Varadarajan.

11) Missing config options for PMTU tests, from Stefano Brivio.

Please pull, thanks a lot.

The following changes since commit 6741c4bb389da103c0d79ad1961884628900bfe6:

  Merge tag 'mips_fixes_4.17_2' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jhogan/mips (2018-05-21 08:58:00 
-0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 

for you to fetch changes up to eb110410b9f6477726026669f3f0c0567e8241e6:

  ibmvnic: Fix partial success login retries (2018-05-25 16:32:48 -0400)


Alexey Kodanev (1):
  dccp: don't free ccid2_hc_tx_sock struct in dccp_disconnect()

Anders Roxell (2):
  selftests: bpf: config: enable NET_SCH_INGRESS for xdp_meta.sh
  selftests: net: reuseport_bpf_numa: don't fail if no numa support

Andrew Zaborowski (1):
  mac80211_hwsim: Fix radio dump for radio idx 0

Bo Chen (1):
  pcnet32: add an error handling path in pcnet32_probe_pci()

Bob Copeland (1):
  mac80211: mesh: fix premature update of rc stats

Colin Ian King (2):
  batman-adv: don't pass a NULL hard_iface to batadv_hardif_put
  net/mlx4: fix spelling mistake: "Inrerface" -> "Interface" and rephrase 
message

Daniel Borkmann (1):
  bpf: properly enforce index mask to prevent out-of-bounds speculation

David S. Miller (6):
  Merge tag 'mac80211-for-davem-2018-05-23' of 
git://git.kernel.org/.../jberg/mac80211
  Merge branch 'virtio_net-mergeable-XDP'
  Merge tag 'wireless-drivers-for-davem-2018-05-22' of 
git://git.kernel.org/.../kvalo/wireless-drivers
  Merge tag 'mlx5-fixes-2018-05-24' of git://git.kernel.org/.../saeed/linux
  Merge tag 'batadv-net-for-davem-20180524' of 
git://git.open-mesh.org/linux-merge
  Merge git://git.kernel.org/.../bpf/bpf

Dedy Lansky (1):
  nl80211: fix nlmsg allocation in cfg80211_ft_event

Eran Ben Elisha (1):
  net/mlx5e: When RXFCS is set, add FCS data into checksum calculation

Eric Biggers (2):
  cfg80211: further limit wiphy names to 64 bytes
  ppp: remove the PPPIOCDETACH ioctl

Eric Dumazet (1):
  ipmr: properly check rhltable_init() return value

Fabio Estevam (2):
  net: fec: ptp: Switch to SPDX identifier
  net: fec: Add a SPDX identifier

Florian Fainelli (2):
  net: phy: broadcom: Fix auxiliary control register reads
  net: phy: broadcom: Fix bcm_write_exp()

Govindarajulu Varadarajan (1):
  enic: set DMA mask to 47 bit

Haim Dreyfuss (1):
  cfg80211: fix NULL pointer derference when querying regdb

Jack Morgenstein (1):
  net/mlx4: Fix irq-unsafe spinlock usage

Jason Wang (6):
  virtio-net: correctly redirect linearized packet
  virtio-net: correctly transmit XDP buff after linearizing
  virtio-net: correctly check num_buf during err path
  virtio-net: fix leaking page for gso packet during mergeable XDP
  tuntap: correctly set SOCKWQ_ASYNC_NOSPACE
  vhost: synchronize IOTLB message with dev cleanup

Kalle Valo (3):
  MAINTAINERS: update Kalle's email address
  MAINTAINERS: change Kalle as ath.ko maintainer
  MAINTAINERS: change Kalle as wcn36xx maintainer

Linus Lüssing (1):
  batman-adv: Fix TT sync flags for intermediate TT responses

Marek Lindner (1):
  batman-adv: prevent TT request storms by not sending inconsistent TT TLVLs

Nathan Fontenot (1):
  ibmvnic: Only do H_EOI for mobility events

Or Gerlitz (1):
  net : sched: cls_api: deal with egdev path only if needed

Qing Huang (1):
  mlx4_core: allocate ICM memory in page size chunks

Rafał Miłecki (3):
  bcma: fix buffer size caused crash in bcma_core_mips_print_irq()
  Revert "ssb: Prevent build of PCI host features in module"
  ssb: make SSB_PCICORE_HOSTMODE depend on SSB = y

Roopa Prabhu (1):
  net: ipv4: add missing RTA_TABLE to rtm_ipv4_policy

Shahed Shaikh (1):
  qed: Fix mask for physical address in ILT entry

Stefano Brivio (1):
  selftests/net: Add missing config options for PMTU tests

Sven Eckelmann (1):
  batman-adv: Avoid race in TT TVLV allocator helper

Thomas Falcon (1):
  ibmvnic: Fix partial 

Re: [PATCH net-next] net: dsa: dsa_loop: Make dynamic debugging helpful

2018-05-25 Thread David Miller
From: Florian Fainelli 
Date: Thu, 24 May 2018 20:52:14 -0700

> Remove redundant debug prints from phy_read/write since we can trace those
> calls through trace events. Enhance dynamic debug prints to print arguments
> which helps figure out what is going on at the driver level with higher 
> level
> configuration interfaces.
> 
> Signed-off-by: Florian Fainelli 

Applied, thanks Florian.


Re: [PATCH] PCI: allow drivers to limit the number of VFs to 0

2018-05-25 Thread Bjorn Helgaas
On Fri, May 25, 2018 at 03:27:52PM -0400, Don Dutile wrote:
> On 05/25/2018 10:02 AM, Bjorn Helgaas wrote:
> > On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
> > > Hi Bjorn!
> > > 
> > > On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:
> > > > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
> > > > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > > > to not fail, e.g.:
> > > > > 
> > > > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > > > 
> > > > > For devices which VF support depends on loaded FW we have the
> > > > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > > > to 0.  Remove the special values completely and simply initialize
> > > > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > > > Add a helper for drivers to reset the VF limit back to total.
> > > > 
> > > > I still can't really make sense out of the changelog.
> > > > 
> > > > I think part of the reason it's confusing is because there are two
> > > > things going on:
> > > > 
> > > >1) You want this:
> > > > pci_sriov_set_totalvfs(dev, 0);
> > > > x = pci_sriov_get_totalvfs(dev)
> > > > 
> > > >   to return 0 instead of total_VFs.  That seems to connect with
> > > >   your subject line.  It means "sriov_totalvfs" in sysfs could be
> > > >   0, but I don't know how that is useful (I'm sure it is; just
> > > >   educate me :))
> > > 
> > > Let me just quote the bug report that got filed on our internal bug
> > > tracker :)
> > > 
> > >When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
> > >errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
> > >then tries to set that as the sriov_numvfs parameter.
> > > 
> > >For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0,
> > >but it's set to max.  When FW is switched to flower*, the correct
> > >sriov_totalvfs value is presented.
> > > 
> > > * flower is a project name
> > 
> >  From the point of view of the PCI core (which knows nothing about
> > device firmware and relies on the architected config space described
> > by the PCIe spec), this sounds like an erratum: with some firmware
> > installed, the device is not capable of SR-IOV, but still advertises
> > an SR-IOV capability with "TotalVFs > 0".
> > 
> > Regardless of whether that's an erratum, we do allow PF drivers to use
> > pci_sriov_set_totalvfs() to limit the number of VFs that may be
> > enabled by writing to the PF's "sriov_numvfs" sysfs file.
> > 
> +1.
> 
> > But the current implementation does not allow a PF driver to limit VFs
> > to 0, and that does seem nonsensical.
> > 
> Well, not really -- claiming to support VFs, and then wanting it to be 0...
> I could certainly argue is non-sensical.
> From a sw perspective, sure, see if we can set VFs to 0 (and reset to another 
> value later).
> 
> /me wishes that implementers would follow the architecture vs torquing it 
> into strange shapes.
> 
> > > My understanding is OpenStack uses sriov_totalvfs to determine how many
> > > VFs can be enabled, looks like this is the code:
> > > 
> > > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
> > > 
> > > >2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
> > > >   sure what you intend for this.  Is *every* driver supposed to
> > > >   call it in .remove()?  Could/should this be done in the core
> > > >   somehow instead of depending on every driver?
> > > 
> > > Good question, I was just thinking yesterday we may want to call it
> > > from the core, but I don't think it's strictly necessary nor always
> > > sufficient (we may reload FW without re-probing).
> > > 
> > > We have a device which supports different number of VFs based on the FW
> > > loaded.  Some legacy FWs does not inform the driver how many VFs it can
> > > support, because it supports max.  So the flow in our driver is this:
> > > 
> > > load_fw(dev);
> > > ...
> > > max_vfs = ask_fw_for_max_vfs(dev);
> > > if (max_vfs >= 0)
> > >   return pci_sriov_set_totalvfs(dev, max_vfs);
> > > else /* FW didn't tell us, assume max */
> > >   return pci_sriov_reset_totalvfs(dev);
> > > 
> > > We also reset the max on device remove, but that's not strictly
> > > necessary.
> > > 
> > > Other users of pci_sriov_set_totalvfs() always know the value to set
> > > the total to (either always get it from FW or it's a constant).
> > > 
> > > If you prefer we can work out the correct max for those legacy cases in
> > > the driver as well, although it seemed cleaner to just ask the core,
> > > since it already has total_VFs value handy :)
> > > 
> > > > I'm also having a hard time connecting your user-space command example
> > > > with the rest of this.  Maybe it will make more sense to me tomorrow
> > > > after some 

Re: [PATCH v4 2/3] media: rc: introduce BPF_PROG_LIRC_MODE2

2018-05-25 Thread Alexei Starovoitov
On Fri, May 18, 2018 at 03:07:29PM +0100, Sean Young wrote:
> Add support for BPF_PROG_LIRC_MODE2. This type of BPF program can call
> rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report
> that the last key should be repeated.
> 
> The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall;
> the target_fd must be the /dev/lircN device.
> 
> Signed-off-by: Sean Young 
...
>  enum bpf_attach_type {
> @@ -158,6 +159,7 @@ enum bpf_attach_type {
>   BPF_CGROUP_INET6_CONNECT,
>   BPF_CGROUP_INET4_POST_BIND,
>   BPF_CGROUP_INET6_POST_BIND,
> + BPF_LIRC_MODE2,
>   __MAX_BPF_ATTACH_TYPE
>  };
>  
> @@ -1902,6 +1904,53 @@ union bpf_attr {
>   *   egress otherwise). This is the only flag supported for now.
>   *   Return
>   *   **SK_PASS** on success, or **SK_DROP** on error.
> + *
> + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
> + *   Description
> + *   This helper is used in programs implementing IR decoding, to
> + *   report a successfully decoded key press with *scancode*,
> + *   *toggle* value in the given *protocol*. The scancode will be
> + *   translated to a keycode using the rc keymap, and reported as
> + *   an input key down event. After a period a key up event is
> + *   generated. This period can be extended by calling either
> + *   **bpf_rc_keydown** () with the same values, or calling
> + *   **bpf_rc_repeat** ().
> + *
> + *   Some protocols include a toggle bit, in case the button
> + *   was released and pressed again between consecutive scancodes
> + *
> + *   The *ctx* should point to the lirc sample as passed into
> + *   the program.
> + *
> + *   The *protocol* is the decoded protocol number (see
> + *   **enum rc_proto** for some predefined values).
> + *
> + *   This helper is only available is the kernel was compiled with
> + *   the **CONFIG_BPF_LIRC_MODE2** configuration option set to
> + *   "**y**".
> + *
> + *   Return
> + *   0
> + *
> + * int bpf_rc_repeat(void *ctx)
> + *   Description
> + *   This helper is used in programs implementing IR decoding, to
> + *   report a successfully decoded repeat key message. This delays
> + *   the generation of a key up event for previously generated
> + *   key down event.
> + *
> + *   Some IR protocols like NEC have a special IR message for
> + *   repeating last button, for when a button is held down.
> + *
> + *   The *ctx* should point to the lirc sample as passed into
> + *   the program.
> + *
> + *   This helper is only available is the kernel was compiled with
> + *   the **CONFIG_BPF_LIRC_MODE2** configuration option set to
> + *   "**y**".

Hi Sean,

thank you for working on this. The patch set looks good to me.
I'd only ask to change above two helper names to something more specific.
Since BPF_PROG_TYPE_LIRC_MODE2 is the name of new prog type and kconfig.
May be bpf_lirc2_keydown() and bpf_lirc2_repeat() ?

> @@ -1576,6 +1577,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
>   case BPF_SK_SKB_STREAM_PARSER:
>   case BPF_SK_SKB_STREAM_VERDICT:
>   return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
> + case BPF_LIRC_MODE2:
> + return rc_dev_prog_attach(attr);
...
> + case BPF_LIRC_MODE2:
> + return rc_dev_prog_detach(attr);

and similar rename for internal function names that go into bpf core.

Please add accumulated acks when you respin.

Thanks



Re: [PATCH net-next v5 0/2] openvswitch: Support conntrack zone limit

2018-05-25 Thread David Miller
From: Yi-Hung Wei 
Date: Thu, 24 May 2018 17:56:41 -0700

> Currently, nf_conntrack_max is used to limit the maximum number of
> conntrack entries in the conntrack table for every network namespace.
> For the VMs and containers that reside in the same namespace,
> they share the same conntrack table, and the total # of conntrack entries
> for all the VMs and containers are limited by nf_conntrack_max.  In this
> case, if one of the VMs/containers abuses the usage of the conntrack entries,
> it blocks the others from committing valid conntrack entries into the
> conntrack table.  Even if we can possibly put the VM in different network
> namespace, the current nf_conntrack_max configuration is kind of rigid
> that we cannot limit different VM/container to have different # conntrack
> entries.
> 
> To address the aforementioned issue, this patch proposes to have a
> fine-grained mechanism that could further limit the # of conntrack entries
> per-zone.  For example, we can designate different zone to different VM,
> and set conntrack limit to each zone.  By providing this isolation, a
> mis-behaved VM only consumes the conntrack entries in its own zone, and
> it will not influence other well-behaved VMs.  Moreover, the users can
> set various conntrack limit to different zone based on their preference.
> 
> The proposed implementation utilizes Netfilter's nf_conncount backend
> to count the number of connections in a particular zone.  If the number of
> connection is above a configured limitation, OVS will return ENOMEM to the
> userspace.  If userspace does not configure the zone limit, the limit
> defaults to zero that is no limitation, which is backward compatible to
> the behavior without this patch.
> 
> The first patch defines the conntrack limit netlink definition, and the
> second patch provides the implementation.
 ...

Series applied, thanks for sticking with it so long and responding to the
feedback you received.


Re: [PATCH net-next] ifb: fix packets checksum

2018-05-25 Thread David Miller
From: Jon Maxwell 
Date: Fri, 25 May 2018 07:38:29 +1000

> Fixup the checksum for CHECKSUM_COMPLETE when pulling skbs on RX path. 
> Otherwise we get splats when tc mirred is used to redirect packets to ifb.
> 
> Before fix:
> 
> nic: hw csum failure
> 
> Signed-off-by: Jon Maxwell 

This definitely seems correct, but I am really surprised a bug like this has
lasted as long as it has.

So I'll let this sit for another day or two for review.


Re: [pull request][net-next V2 0/6] Mellanox, mlx5e updates 2018-05-19

2018-05-25 Thread David Miller
From: Saeed Mahameed 
Date: Thu, 24 May 2018 14:38:14 -0700

> This is a mlx5e only pull request, for more information please see tag
> log below.
> 
> Please pull and let me know if there's any problem.
> 
> v1->v2:
> 1) patch #1 commit message: lldptool usage example and explanation on why 
>dcbnl is selected over devlink interface as was agreed on mailing list.
> 
> 2) patches #1 and #6: Add total_size in dcbnl_buffer to report the total
>available buffer size of the netdev, as suggested by John.
> 
> 3) Added Reviewed-by tag to all the patches.

Ok, thanks for the discussion and details in patch #1.

Pulled, thanks.


Re: [PATCH 00/14] Modify action API for implementing lockless actions

2018-05-25 Thread Vlad Buslov

On Thu 24 May 2018 at 23:34, Cong Wang  wrote:
> On Mon, May 14, 2018 at 7:27 AM, Vlad Buslov  wrote:
>> Currently, all netlink protocol handlers for updating rules, actions and
>> qdiscs are protected with single global rtnl lock which removes any
>> possibility for parallelism. This patch set is a first step to remove
>> rtnl lock dependency from TC rules update path. It updates act API to
>> use atomic operations, rcu and spinlocks for fine-grained locking. It
>> also extend API with functions that are needed to update existing
>> actions for parallel execution.
>
> Can you give a summary here for what and how it is achieved?

Got it, will expand cover letter in V2 with summary.
>
> You said this is the first step, what do you want to achieve in this
> very first step? And how do you achieve it? Do you break the RTNL

But aren't these questions answered in the paragraph you quoted?
What: Change act API to not rely on one-big-global-RTNL-lock and to use
more fine-grained synchronization methods to allow safe concurrent
execution.
How: Refactor act API code to use atomics, rcu and spinlocks, etc. for
protecting shared data structures, add new functions required to update
specific actions implementation for parallel execution. (step 2)

If you feel that this cover letter is too terse, I will add outline of
changes in V2.

> lock down to, for a quick example, a per-device lock? Or perhaps you
> completely remove it because of what reason?

I want to remove RTNL _dependency_ from act API data structures and
code. I probably should be more specific in this case:

Florian recently made a change that allows registering netlink protocol
handlers with flag RTNL_FLAG_DOIT_UNLOCKED. Handlers registered with
this flag are called without RTNL taken. My end goal is to have rule
update handlers(RTM_NEWTFILTER, RTM_DELTFILTER, etc.) to be registered
with UNLOCKED flag to allow parallel execution.

I do not intend to globally remove or break RTNL.

>
> I go through all the descriptions of your 14 patches (but not any code),
> I still have no clue how you successfully avoid RTNL. Please don't
> let me read into your code to understand that, there must be some
> high-level justification on how it works. Without it, I don't event want
> to read into the code.

On internal code review I've been asked not to duplicate info from
commit messages in cover letter, but I guess I can expand it with some
high level outline in V2.

>
> Thanks.

Thank you for your feedback!


Re: [PATCH] 8139too: Remove unnecessary netif_napi_del()

2018-05-25 Thread David Miller
From: Bo Chen 
Date: Thu, 24 May 2018 12:48:35 -0700

> The call to free_netdev() in __rtl8139_cleanup_dev() clears the network device
> napi list, and explicit calls to netif_napi_del() are unnecessary.
> 
> Signed-off-by: Bo Chen 

Since this is just unnecessary work and not a bug, applied to net-next.

Thanks.


Re: [PATCH net] ibmvnic: Fix partial success login retries

2018-05-25 Thread David Miller
From: Thomas Falcon 
Date: Thu, 24 May 2018 14:37:53 -0500

> In its current state, the driver will handle backing device
> login in a loop for a certain number of retries while the
> device returns a partial success, indicating that the driver
> may need to try again using a smaller number of resources.
> 
> The variable it checks to continue retrying may change
> over the course of operations, resulting in reallocation
> of resources but exits without sending the login attempt.
> Guard against this by introducing a boolean variable that
> will retain the state indicating that the driver needs to
> reattempt login with backing device firmware.
> 
> Signed-off-by: Thomas Falcon 

Applied.


Re: Poor TCP performance with XPS enabled after scrubbing skb

2018-05-25 Thread David Miller
From: Flavio Leitner 
Date: Thu, 24 May 2018 16:17:29 -0300

> veth originally called skb_orphan() on veth_xmit() most probably
> because there was no TX completion. Then the code got generalized to
> dev_forward_skb() and later on moved to skb_scrub_packet().
> 
> The issue is that we call skb_scrub_packet() on TX and RX paths and
> that is done while crossing netns.  It doesn't look correct to keep
> the ->sk because I suspect that iptables/selinux/bpf, or some code
> path that I am probably missing could expose/use the wrong ->sk, for
> example.
> 
> However, netdev_pick_tx() can't store the queue mapping without ->sk.
> 
> The hack in the first email relies on the headers (skb_tx_hash) to
> always select the same TX queue, which solves the original problem
> but not the TCP small queues you mentioned.

Right, we can't allow a socket reference to escape over a netns
crossing.

However, that is where we get the queue mapping state.

We might need to put the sk based decision into the skb somehow in
order to satisfy these two incompatible requirements.


Re: [PATCH net-next] tcp: use data length instead of skb->len in tcp_probe

2018-05-25 Thread David Miller
From: Song Liu 
Date: Thu, 24 May 2018 17:44:46 +

> We should also rename __entry->length to __entry->data_len, so that whoever
> using this field will notice the change. 

Agreed.


Re: [PATCH net-next 0/5] qed*: ethtool rx flow classification enhancements.

2018-05-25 Thread David Miller
From: Manish Chopra 
Date: Thu, 24 May 2018 09:54:48 -0700

> This series re-structures the driver's ethtool rx flow
> classification flow, following that it adds other flow
> profiles and rx flow classification enhancements
> via "ethtool -N/-U"
> 
> Please consider applying this to "net-next"

The code is definitely easier to read and understand, especially after
patch #1.

Series applied, thank you.


Re: [PATCH net-next] vrf: add CRC32c offload to device features

2018-05-25 Thread David Ahern
On 5/24/18 9:49 AM, Davide Caratti wrote:
> SCTP sockets originated in a VRF can improve their performance if CRC32c
> computation is delegated to underlying devices: update device features,
> setting NETIF_F_SCTP_CRC. Iterating the following command in the topology
> proposed with [1],
> 
>  # ip vrf exec vrf-h2 netperf -H 192.0.2.1 -t SCTP_STREAM -- -m 10K
> 
> the measured throughput in Mbit/s improved from 2395 ± 1% to 2720 ± 1%.
> 
> [1] https://www.spinics.net/lists/netdev/msg486007.html
> 
> Signed-off-by: Davide Caratti 
> ---
>  drivers/net/vrf.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Acked-by: David Ahern 


Re: pull-request: bpf 2018-05-24

2018-05-25 Thread Daniel Borkmann
On 05/25/2018 09:51 PM, David Miller wrote:
> From: Daniel Borkmann 
> Date: Thu, 24 May 2018 18:38:02 +0200
> 
>> The following pull-request contains BPF updates for your *net* tree.
>>
>> The main changes are:
>>
>> 1) Fix a bug in the original fix to prevent out of bounds speculation when
>>multiple tail call maps from different branches or calls end up at the
>>same tail call helper invocation, from Daniel.
>>
>> 2) Two selftest fixes, one in reuseport_bpf_numa where test is skipped in
>>case of missing numa support and another one to update kernel config to
>>properly support xdp_meta.sh test, from Anders.
>>
>> Please consider pulling these changes from:
>>
>>   git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
> 
> Pulled, thanks Daniel.
> 
>> Would be great if you have a chance to merge net into net-next after that.
>>
>> The verifier fix would be needed later as a dependency in bpf-next for
>> upcoming work there. When you do the merge there's a trivial conflict on
>> BPF side with 849fa50662fb ("bpf/verifier: refine retval R0 state for
>> bpf_get_stack helper"): Resolution is to keep both functions, the
>> do_refine_retval_range() and record_func_map().
> 
> I'll try to push it along as soon as I can.
> 
> Thanks for the merge conflict heads-up.

Awesome, thanks a lot David!


Re: pull-request: bpf 2018-05-24

2018-05-25 Thread David Miller
From: Daniel Borkmann 
Date: Thu, 24 May 2018 18:38:02 +0200

> The following pull-request contains BPF updates for your *net* tree.
> 
> The main changes are:
> 
> 1) Fix a bug in the original fix to prevent out of bounds speculation when
>multiple tail call maps from different branches or calls end up at the
>same tail call helper invocation, from Daniel.
> 
> 2) Two selftest fixes, one in reuseport_bpf_numa where test is skipped in
>case of missing numa support and another one to update kernel config to
>properly support xdp_meta.sh test, from Anders.
> 
> Please consider pulling these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git

Pulled, thanks Daniel.

> Would be great if you have a chance to merge net into net-next after that.
> 
> The verifier fix would be needed later as a dependency in bpf-next for
> upcoming work there. When you do the merge there's a trivial conflict on
> BPF side with 849fa50662fb ("bpf/verifier: refine retval R0 state for
> bpf_get_stack helper"): Resolution is to keep both functions, the
> do_refine_retval_range() and record_func_map().

I'll try to push it along as soon as I can.

Thanks for the merge conflict heads-up.


[PATCH 2/2] batman-adv: Drop "experimental" from BATMAN_V Kconfig

2018-05-25 Thread Sven Eckelmann
The Kconfig option BATMAN_ADV_BATMAN_V is now enabled by default when the
BATMAN_ADV is enabled. A feature which is enabled by default for a module
should not be considered experimental.

Reported-by: Joe Perches 
Signed-off-by: Sven Eckelmann 
---
 net/batman-adv/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index 41bb67d70c83..da0b7aa98be9 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -32,7 +32,7 @@ config BATMAN_ADV
   tools.
 
 config BATMAN_ADV_BATMAN_V
-   bool "B.A.T.M.A.N. V protocol (experimental)"
+   bool "B.A.T.M.A.N. V protocol"
depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
default y
help
-- 
2.17.0



[PATCH 1/2] batman-adv: Remove "default n" in Kconfig

2018-05-25 Thread Sven Eckelmann
The "default n" is the default value for any bool or tristate Kconfig
setting. It is therefore not necessary to add it to a config entry.

Reported-by: Sergei Shtylyov 
Signed-off-by: Sven Eckelmann 
---
 net/batman-adv/Kconfig | 5 -
 1 file changed, 5 deletions(-)

diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index de8034d80623..41bb67d70c83 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -24,7 +24,6 @@ config BATMAN_ADV
depends on NET
select CRC16
select LIBCRC32C
-default n
help
   B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
   a routing protocol for multi-hop ad-hoc mesh networks. The
@@ -60,7 +59,6 @@ config BATMAN_ADV_BLA
 config BATMAN_ADV_DAT
bool "Distributed ARP Table"
depends on BATMAN_ADV && INET
-   default n
help
  This option enables DAT (Distributed ARP Table), a DHT based
  mechanism that increases ARP reliability on sparse wireless
@@ -70,7 +68,6 @@ config BATMAN_ADV_DAT
 config BATMAN_ADV_NC
bool "Network Coding"
depends on BATMAN_ADV
-   default n
help
  This option enables network coding, a mechanism that aims to
  increase the overall network throughput by fusing multiple
@@ -84,7 +81,6 @@ config BATMAN_ADV_NC
 config BATMAN_ADV_MCAST
bool "Multicast optimisation"
depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y)
-   default n
help
  This option enables the multicast optimisation which aims to
  reduce the air overhead while improving the reliability of
@@ -94,7 +90,6 @@ config BATMAN_ADV_DEBUGFS
bool "batman-adv debugfs entries"
depends on BATMAN_ADV
depends on DEBUG_FS
-   default n
help
  Enable this to export routing related debug tables via debugfs.
  The information for each soft-interface and used hard-interface can be
-- 
2.17.0



Re: [PATCH net] net: sched: check netif_xmit_frozen_or_stopped() in sch_direct_xmit()

2018-05-25 Thread Song Liu
On Fri, May 25, 2018 at 11:11 AM, Song Liu  wrote:
> Summary:
>
> At the end of sch_direct_xmit(), we are in the else path of
> !dev_xmit_complete(ret), which means ret == NETDEV_TX_OK. The following
> condition will always fail and netif_xmit_frozen_or_stopped() is not
> checked at all.
>
> if (ret && netif_xmit_frozen_or_stopped(txq))
>  return false;
>
> In this patch, this condition is fixed as:
>
> if (netif_xmit_frozen_or_stopped(txq))
>  return false;
>
> and further simplifies the code as:
>
> return !netif_xmit_frozen_or_stopped(txq);
>
> Fixes: 29b86cdac00a ("net: sched: remove remaining uses for qdisc_qlen in 
> xmit path")
> Cc: John Fastabend 
> Cc: David S. Miller 
> Signed-off-by: Song Liu 
> ---
>  net/sched/sch_generic.c | 5 +
>  1 file changed, 1 insertion(+), 4 deletions(-)
>
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 39c144b..8261d48 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -346,10 +346,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc 
> *q,
> return false;
> }
>
> -   if (ret && netif_xmit_frozen_or_stopped(txq))
> -   return false;
> -
> -   return true;
> +   return !netif_xmit_frozen_or_stopped(txq);
>  }
>
>  /*
> --
> 2.9.5
>

Alexei and I discussed about this offline. We would like to share our
discussion here to
clarify the motivation.

Before 29b86cdac00a, ret in condition "if (ret &&
netif_xmit_frozen_or_stopped()" is not
the value from dev_hard_start_xmit(), because ret is overwritten by
either qdisc_qlen()
or dev_requeue_skb(). Therefore, 29b86cdac00a changed the behavior of
this condition.

For ret from dev_hard_start_xmit(), I dig into the function and found
it is from return value
of ndo_start_xmit(). Per netdevice.h, ndo_start_xmit() should only
return NETDEV_TX_OK
or NETDEV_TX_BUSY. I survey many drivers, and they all follow the rule. The only
exception is vlan.

Given ret could only be NETDEV_TX_OK or NETDEV_TX_BUSY (ignore vlan for now),
if it fails condition "if (!dev_xmit_complete(ret))", ret must be
NETDEV_TX_OK == 0. So
netif_xmit_frozen_or_stopped() will always be bypassed.

It is probably OK to ignore netif_xmit_frozen_or_stopped(), and return true from
sch_direct_xmit(), as I didn't see that break any functionality. But
it is more like "correct
by accident" to me. This is the motivation of my original patch.

Alexei pointed out that, the following condition is more like original logic:

  if (qdisc_qlen(q) && netif_xmit_frozen_or_stopped(txq))
return false;

However, I think John would like to remove qdisc_qlen() from the tx
path. I didn't see
any issue without the extra qdisc_qlen() check, so the patch is
probably good AS-IS.

Please share your comments and feedback on this.

Thanks,
Song


Re: [PATCH bpf-next v2 0/3] bpf: add boot parameters for sysctl knobs

2018-05-25 Thread Alexei Starovoitov
On Fri, May 25, 2018 at 06:50:09PM +0200, Eugene Syromiatnikov wrote:
> On Thu, May 24, 2018 at 04:34:51PM -0700, Alexei Starovoitov wrote:
> > On Thu, May 24, 2018 at 09:41:08AM +0200, Jesper Dangaard Brouer wrote:
> > > On Wed, 23 May 2018 15:02:45 -0700
> > > Alexei Starovoitov  wrote:
> > > 
> > > > On Wed, May 23, 2018 at 02:18:19PM +0200, Eugene Syromiatnikov wrote:
> > > > > Some BPF sysctl knobs affect the loading of BPF programs, and during
> > > > > system boot/init stages these sysctls are not yet configured.
> > > > > A concrete example is systemd, that has implemented loading of BPF
> > > > > programs.
> > > > > 
> > > > > Thus, to allow controlling these setting at early boot, this patch set
> > > > > adds the ability to change the default setting of these sysctl knobs
> > > > > as well as option to override them via a boot-time kernel parameter
> > > > > (in order to avoid rebuilding kernel each time a need of changing 
> > > > > these
> > > > > defaults arises).
> > > > > 
> > > > > The sysctl knobs in question are kernel.unprivileged_bpf_disable,
> > > > > net.core.bpf_jit_harden, and net.core.bpf_jit_kallsyms.  
> > > > 
> > > > - systemd is root. today it only uses cgroup-bpf progs which require 
> > > > root,
> > > >   so disabling unpriv during boot time makes no difference to systemd.
> > > >   what is the actual reason to present time?
> systemd also runs a lot of code, some of which is unprivileged.

systemd processes sysctl configs first. It's essential for system
security to do so. If you have concerns in how systemd does that
please bring it up with systemd folks.

> > > > - say in the future systemd wants to use so_reuseport+bpf for faster
> > > >   networking. With unpriv disable during boot, it will force systemd
> > > >   to do such networking from root, which will lower its security 
> > > > barrier.
> No, it will force systemd not to use SO_REUSEPORT BPF.

sorry this argument makes no sense to me.

> > > > - bpf_jit_kallsyms sysctl has immediate effect on loaded programs.
> > > >   Flipping it during the boot or right after or any time after
> > > >   is the same thing. Why add such boot flag then?
> Well, that one was for completeness.

Should we convert all sysctls to bootparams for 'completeness' ?

> > > > - jit_harden can be turned on by systemd. so turning it during the boot
> > > >   will make systemd progs to be constant blinded.
> > > >   Constant blinding protects kernel from unprivileged JIT spraying.
> > > >   Are you worried that systemd will attack the kernel with JIT spraying?
> I'm worried that systemd can be exploited for a JIT spraying attack.

I'm afraid we're not on the same page with definition of 'JIT spraying attack'.

> Another thing I'm concerned with is that the generated code is different,
> which introduces additional complication during debugging.

specifically what kind of complication?



Re: [PATCH] net: netsec: reduce DMA mask to 40 bits

2018-05-25 Thread Robin Murphy
On Sat, 26 May 2018 00:33:05 +0530
Jassi Brar  wrote:

> On 25 May 2018 at 18:20, Ard Biesheuvel 
> wrote:
> > The netsec network controller IP can drive 64 address bits for DMA,
> > and the DMA mask is set accordingly in the driver. However, the
> > SynQuacer SoC, which is the only silicon incorporating this IP at
> > the moment, integrates this IP in a manner that leaves address bits
> > [63:40] unconnected.
> >
> > Up until now, this has not resulted in any problems, given that the
> > DDR controller doesn't decode those bits to begin with. However,
> > recent firmware updates for platforms incorporating this SoC allow
> > the IOMMU to be enabled, which does decode address bits [47:40],
> > and allocates top down from the IOVA space, producing DMA addresses
> > that have bits set that have been left unconnected.
> >
> > Both the DT and ACPI (IORT) descriptions of the platform take this
> > into account, and only describe a DMA address space of 40 bits
> > (using either dma-ranges DT properties, or DMA address limits in
> > IORT named component nodes). However, even though our IOMMU and bus
> > layers may take such limitations into account by setting a narrower
> > DMA mask when creating the platform device, the netsec probe()
> > entrypoint follows the common practice of setting the DMA mask
> > unconditionally, according to the capabilities of the IP block itself
> > rather than to its integration into the chip.
> >
> > It is currently unclear what the correct fix is here. We could hack
> > around it by only setting the DMA mask if it deviates from its
> > default value of DMA_BIT_MASK(32). However, this makes it
> > impossible for the bus layer to use DMA_BIT_MASK(32) as the bus
> > limit, and so it appears that a more comprehensive approach is
> > required to take DMA limits imposed by the SoC as a whole into
> > account.
> >
> > In the mean time, let's limit the DMA mask to 40 bits. Given that
> > there is currently only one SoC that incorporates this IP, this is
> > a reasonable approach that can be backported to -stable and buys us
> > some time to come up with a proper fix going forward.
> >  
> I am sure you already thought about it, but why not let the platform
> specify the bit mask for the driver (via some "bus-width" property),
> to override the default 64 bit mask?

Because lack of a property to describe the integration is not the
problem. There are already at least two ways: the general DT/IORT
properties for describing DMA addressing - which it would be a bit
ungainly for a driver to parse for this reason, but not impossible -
and inferring it from a SoC-specific compatible - which is more
appropriate, and what we happen to be able to do here.

Robin.


Re: [PATCH net-next] vrf: add CRC32c offload to device features

2018-05-25 Thread David Miller
From: Davide Caratti 
Date: Thu, 24 May 2018 17:49:35 +0200

> SCTP sockets originated in a VRF can improve their performance if CRC32c
> computation is delegated to underlying devices: update device features,
> setting NETIF_F_SCTP_CRC. Iterating the following command in the topology
> proposed with [1],
> 
>  # ip vrf exec vrf-h2 netperf -H 192.0.2.1 -t SCTP_STREAM -- -m 10K
> 
> the measured throughput in Mbit/s improved from 2395 ± 1% to 2720 ± 1%.
> 
> [1] https://www.spinics.net/lists/netdev/msg486007.html
> 
> Signed-off-by: Davide Caratti 

David A., please review.


Re: [PATCH] PCI: allow drivers to limit the number of VFs to 0

2018-05-25 Thread Don Dutile

On 05/25/2018 10:02 AM, Bjorn Helgaas wrote:

On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:

Hi Bjorn!

On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:

On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:

Some user space depends on enabling sriov_totalvfs number of VFs
to not fail, e.g.:

$ cat .../sriov_totalvfs > .../sriov_numvfs

For devices which VF support depends on loaded FW we have the
pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
a special "unset" value, meaning drivers can't limit sriov_totalvfs
to 0.  Remove the special values completely and simply initialize
driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
Add a helper for drivers to reset the VF limit back to total.


I still can't really make sense out of the changelog.

I think part of the reason it's confusing is because there are two
things going on:

   1) You want this:
   
pci_sriov_set_totalvfs(dev, 0);

x = pci_sriov_get_totalvfs(dev)

  to return 0 instead of total_VFs.  That seems to connect with
  your subject line.  It means "sriov_totalvfs" in sysfs could be
  0, but I don't know how that is useful (I'm sure it is; just
  educate me :))


Let me just quote the bug report that got filed on our internal bug
tracker :)

   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
   then tries to set that as the sriov_numvfs parameter.

   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0,
   but it's set to max.  When FW is switched to flower*, the correct
   sriov_totalvfs value is presented.

* flower is a project name


 From the point of view of the PCI core (which knows nothing about
device firmware and relies on the architected config space described
by the PCIe spec), this sounds like an erratum: with some firmware
installed, the device is not capable of SR-IOV, but still advertises
an SR-IOV capability with "TotalVFs > 0".

Regardless of whether that's an erratum, we do allow PF drivers to use
pci_sriov_set_totalvfs() to limit the number of VFs that may be
enabled by writing to the PF's "sriov_numvfs" sysfs file.


+1.


But the current implementation does not allow a PF driver to limit VFs
to 0, and that does seem nonsensical.


Well, not really -- claiming to support VFs, and then wanting it to be 0...
I could certainly argue is non-sensical.
From a sw perspective, sure, see if we can set VFs to 0 (and reset to another 
value later).

/me wishes that implementers would follow the architecture vs torquing it into 
strange shapes.


My understanding is OpenStack uses sriov_totalvfs to determine how many
VFs can be enabled, looks like this is the code:

http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464


   2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
  sure what you intend for this.  Is *every* driver supposed to
  call it in .remove()?  Could/should this be done in the core
  somehow instead of depending on every driver?


Good question, I was just thinking yesterday we may want to call it
from the core, but I don't think it's strictly necessary nor always
sufficient (we may reload FW without re-probing).

We have a device which supports different number of VFs based on the FW
loaded.  Some legacy FWs does not inform the driver how many VFs it can
support, because it supports max.  So the flow in our driver is this:

load_fw(dev);
...
max_vfs = ask_fw_for_max_vfs(dev);
if (max_vfs >= 0)
return pci_sriov_set_totalvfs(dev, max_vfs);
else /* FW didn't tell us, assume max */
return pci_sriov_reset_totalvfs(dev);

We also reset the max on device remove, but that's not strictly
necessary.

Other users of pci_sriov_set_totalvfs() always know the value to set
the total to (either always get it from FW or it's a constant).

If you prefer we can work out the correct max for those legacy cases in
the driver as well, although it seemed cleaner to just ask the core,
since it already has total_VFs value handy :)


I'm also having a hard time connecting your user-space command example
with the rest of this.  Maybe it will make more sense to me tomorrow
after some coffee.


OpenStack assumes it will always be able to set sriov_numvfs to
sriov_totalvfs, see this 'if':

http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512


Thanks for educating me.  I think there are two issues here that we
can separate.  I extracted the patch below for the first.

The second is the question of resetting driver_max_VFs.  I think we
currently have a general issue in the core:

   - load PF driver 1
   - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
   - unload PF driver 1
   - load PF driver 2

Now driver_max_VFs is still stuck at the lower value set by driver 1.
I don't think that's the 

Re: [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs

2018-05-25 Thread Neil Horman
On Sat, May 26, 2018 at 01:41:02AM +0800, Xin Long wrote:
> syzbot reported a rcu_sched self-detected stall on CPU which is caused
> by too small value set on rto_min with SCTP_RTOINFO sockopt. With this
> value, hb_timer will get stuck there, as in its timer handler it starts
> this timer again with this value, then goes to the timer handler again.
> 
> This problem is there since very beginning, and thanks to Eric for the
> reproducer shared from a syzbot mail.
> 
> This patch fixes it by not allowing to set rto_min with a value below
> 200 msecs, which is based on TCP's, by either setsockopt or sysctl.
> 
> Reported-by: syzbot+3dcd59a1f907245f8...@syzkaller.appspotmail.com
> Suggested-by: Marcelo Ricardo Leitner 
> Signed-off-by: Xin Long 
> ---
>  include/net/sctp/constants.h |  1 +
>  net/sctp/socket.c| 10 +++---
>  net/sctp/sysctl.c|  3 ++-
>  3 files changed, 10 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
> index 20ff237..2ee7a7b 100644
> --- a/include/net/sctp/constants.h
> +++ b/include/net/sctp/constants.h
> @@ -277,6 +277,7 @@ enum { SCTP_MAX_GABS = 16 };
>  #define SCTP_RTO_INITIAL (3 * 1000)
>  #define SCTP_RTO_MIN (1 * 1000)
>  #define SCTP_RTO_MAX (60 * 1000)
> +#define SCTP_RTO_HARD_MIN200
>  
>  #define SCTP_RTO_ALPHA  3   /* 1/8 when converted to right shifts. */
>  #define SCTP_RTO_BETA   2   /* 1/4 when converted to right shifts. */
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index ae7e7c6..6ef12c7 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3029,7 +3029,8 @@ static int sctp_setsockopt_nodelay(struct sock *sk, 
> char __user *optval,
>   * be changed.
>   *
>   */
> -static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, 
> unsigned int optlen)
> +static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval,
> +unsigned int optlen)
>  {
>   struct sctp_rtoinfo rtoinfo;
>   struct sctp_association *asoc;
> @@ -3056,10 +3057,13 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, 
> char __user *optval, unsigne
>   else
>   rto_max = asoc ? asoc->rto_max : sp->rtoinfo.srto_max;
>  
> - if (rto_min)
> + if (rto_min) {
> + if (rto_min < SCTP_RTO_HARD_MIN)
> + return -EINVAL;
>   rto_min = asoc ? msecs_to_jiffies(rto_min) : rto_min;
> - else
> + } else {
>   rto_min = asoc ? asoc->rto_min : sp->rtoinfo.srto_min;
> + }
>  
>   if (rto_min > rto_max)
>   return -EINVAL;
> diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> index 33ca5b7..7ec854a 100644
> --- a/net/sctp/sysctl.c
> +++ b/net/sctp/sysctl.c
> @@ -52,6 +52,7 @@ static int rto_alpha_min = 0;
>  static int rto_beta_min = 0;
>  static int rto_alpha_max = 1000;
>  static int rto_beta_max = 1000;
> +static int rto_hard_min = SCTP_RTO_HARD_MIN;
>  
>  static unsigned long max_autoclose_min = 0;
>  static unsigned long max_autoclose_max =
> @@ -116,7 +117,7 @@ static struct ctl_table sctp_net_table[] = {
>   .maxlen = sizeof(unsigned int),
>   .mode   = 0644,
>   .proc_handler   = proc_sctp_do_rto_min,
> - .extra1 = ,
> + .extra1 = _hard_min,
>   .extra2 = _net.sctp.rto_max
>   },
>   {
> -- 
> 2.1.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
Patch looks fine, you probably want to note this hard minimum in man(7) sctp as
well

Acked-by: Neil Horman 



Re: [PATCH net] selftests/net: Add missing config options for PMTU tests

2018-05-25 Thread David Miller
From: Stefano Brivio 
Date: Thu, 24 May 2018 16:10:12 +0200

> PMTU tests in pmtu.sh need support for VTI, VTI6 and dummy
> interfaces: add them to config file.
> 
> Reported-by: Naresh Kamboju 
> Fixes: d1f1b9cbf34c ("selftests: net: Introduce first PMTU test")
> Signed-off-by: Stefano Brivio 

Applied, thank you.


Re: [PATCH net-next] cxgb4/cxgb4vf: Notify link changes to OS-dependent code

2018-05-25 Thread David Miller
From: Ganesh Goudar 
Date: Thu, 24 May 2018 19:33:37 +0530

> From: Arjun Vynipadath 
> 
> We have a confusion of two different abstractions in the Common
> Code:  Physical Link (Port) and Logical Network Interface (Virtual
> Interface), and we haven't been properly managing the state of the
> intersection of those two abstractions.
> On the one hand we have the Physical state of the Link -- up or down --
> and on the other we have the logical state of the VI, enabled or not.
> {ethN} refers to both the Physical and Logical State. In this case,
> ifconfig only affects/interrogates the Logical State of a VI,
> and ethtool only deals with the Physical State. And these are different.
> 
> So, just because we disable the VI, we don't really want to change the
> Physical Link Up/Down state.  Thus, the previous hack to set
> "lc->link_ok = 0" when we disable a VI is completely incorrect.
> 
> Where we get into trouble is where the Physical Link State and the
> Logical VI State cross swords.  And that happens in
> t4_handle_get_port_info() where we need to manage/safe the Physical
> Link State, but we also need to know when the Logical VI State has
> changed and pass that back up to the OS-dependent Driver routine
> t4_os_link_changed() which is concerned about the Logical Interface.
> 
> So we enable a VI and that causes Firmware to send us a new Port
> Information message, but if none of the Physical Link State
> particulars have changed, we don't call t4_os_link_changed().
> 
> This fix uses the existing OS Contract APIs for the Common Code to
> inform the OS-dependent portion of the Host Driver when the "Link" (really
> Logical Network Interface) is "up" or "down". A new API
> t4_enable_pi_params() is added which calls t4_enable_vi_params() and,
> if that is successful, then calls back to the OS Contract API
> t4_os_link_changed() notifying the OS-dependent layer of the
> potential Link State change.
> 
> Original Work by : Casey Leedom 
> 
> Signed-off-by: Santosh Rastapur 
> Signed-off-by: Arjun Vynipadath 
> Signed-off-by: Ganesh Goudar 

Applied, thanks.


Re: [PATCH] net: netsec: reduce DMA mask to 40 bits

2018-05-25 Thread Jassi Brar
On 25 May 2018 at 18:20, Ard Biesheuvel  wrote:
> The netsec network controller IP can drive 64 address bits for DMA, and
> the DMA mask is set accordingly in the driver. However, the SynQuacer
> SoC, which is the only silicon incorporating this IP at the moment,
> integrates this IP in a manner that leaves address bits [63:40]
> unconnected.
>
> Up until now, this has not resulted in any problems, given that the DDR
> controller doesn't decode those bits to begin with. However, recent
> firmware updates for platforms incorporating this SoC allow the IOMMU
> to be enabled, which does decode address bits [47:40], and allocates
> top down from the IOVA space, producing DMA addresses that have bits
> set that have been left unconnected.
>
> Both the DT and ACPI (IORT) descriptions of the platform take this into
> account, and only describe a DMA address space of 40 bits (using either
> dma-ranges DT properties, or DMA address limits in IORT named component
> nodes). However, even though our IOMMU and bus layers may take such
> limitations into account by setting a narrower DMA mask when creating
> the platform device, the netsec probe() entrypoint follows the common
> practice of setting the DMA mask unconditionally, according to the
> capabilities of the IP block itself rather than to its integration into
> the chip.
>
> It is currently unclear what the correct fix is here. We could hack around
> it by only setting the DMA mask if it deviates from its default value of
> DMA_BIT_MASK(32). However, this makes it impossible for the bus layer to
> use DMA_BIT_MASK(32) as the bus limit, and so it appears that a more
> comprehensive approach is required to take DMA limits imposed by the
> SoC as a whole into account.
>
> In the mean time, let's limit the DMA mask to 40 bits. Given that there
> is currently only one SoC that incorporates this IP, this is a reasonable
> approach that can be backported to -stable and buys us some time to come
> up with a proper fix going forward.
>
I am sure you already thought about it, but why not let the platform
specify the bit mask for the driver (via some "bus-width" property),
to override the default 64 bit mask?

Cheers!


Re: [PATCH net-next] cxgb4: clean up init_one

2018-05-25 Thread David Miller
From: Ganesh Goudar 
Date: Thu, 24 May 2018 18:32:15 +0530

> clean up init_one and use chip_ver consistently throughout
> init_one() for chip version.
> 
> Signed-off-by: Casey Leedom 
> Signed-off-by: Ganesh Goudar 

Applied, thanks.


Re: [PATCH net-next v2] cxgb4/cxgb4vf: link management changes for new SFP

2018-05-25 Thread David Miller
From: Ganesh Goudar 
Date: Thu, 24 May 2018 17:49:30 +0530

> newer SFPs like SFP28 and QSFP28 Transceiver Modules present
> several new possibilities which we haven't faced before. Fix the
> assumptions in the code reflecting the more limited capabilities
> of previous Transceiver Module systems
> 
> Original work by Casey Leedom 
> 
> Signed-off-by: Ganesh Goudar 
> ---
> V2: Was not getting applied on net-next, respining on net-next

Applied, thank you.


Re: [PATCH 0/4] pull request for net: batman-adv 2018-05-24

2018-05-25 Thread David Miller
From: Simon Wunderlich 
Date: Thu, 24 May 2018 13:53:21 +0200

> here are a couple of bugfixes which we would like to have integrated into net.
> 
> Please pull or let me know of any problem!

Looks good, pulled, thanks Simon.


Re: [PATCH net-next] net: fec: remove stale comment

2018-05-25 Thread David Miller
From: YueHaibing 
Date: Thu, 24 May 2018 19:27:07 +0800

> This comment is outdated as fec_ptp_ioctl has been replaced by 
> fec_ptp_set/fec_ptp_get 
> since commit 1d5244d0e43b ("fec: Implement the SIOCGHWTSTAMP ioctl")
> 
> Signed-off-by: YueHaibing 

Applied, thank you.


Re: [PATCH v2 net-next] sfc: stop the TX queue before pushing new buffers

2018-05-25 Thread David Miller
From: Martin Habets 
Date: Thu, 24 May 2018 10:14:00 +0100

> efx_enqueue_skb() can push new buffers for the xmit_more functionality.
> We must stop the TX queue before this or else the TX queue does not get
> restarted and we get a netdev watchdog.
> 
> In the error handling we may now need to unwind more than 1 packet, and
> we may need to push the new buffers onto the partner queue.
> 
> v2: In the error leg also push this queue if xmit_more is set
> 
> Fixes: e9117e5099ea ("sfc: Firmware-Assisted TSO version 2")
> Reported-by: Jarod Wilson 
> Tested-by: Jarod Wilson 
> Signed-off-by: Martin Habets 
> ---
> 
> Dave, could you please also queue this patch up for stable?

Applied to net-next.

As per -stable, only patches that go into my 'net' tree may be proposed
for -stable.


Re: [PATCH v2] ath6kl: mark expected switch fall-throughs

2018-05-25 Thread Gustavo A. R. Silva



On 05/25/2018 01:27 PM, Steve deRosier wrote:

On Fri, May 25, 2018 at 11:23 AM Gustavo A. R. Silva

wrote:


In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.



Signed-off-by: Gustavo A. R. Silva 
---
Changes in v2:
- Place code comments on a line of their own.



drivers/net/wireless/ath/ath6kl/cfg80211.c | 3 +++
1 file changed, 3 insertions(+)



diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c

b/drivers/net/wireless/ath/ath6kl/cfg80211.c

index 2ba8cf3..a16ee5d 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -3899,16 +3899,19 @@ int ath6kl_cfg80211_init(struct ath6kl *ar)
   switch (ar->hw.cap) {
   case WMI_11AN_CAP:
   ht = true;
+   /* fall through */
   case WMI_11A_CAP:
   band_5gig = true;
   break;
   case WMI_11GN_CAP:
   ht = true;
+   /* fall through */
   case WMI_11G_CAP:
   band_2gig = true;
   break;
   case WMI_11AGN_CAP:
   ht = true;
+   /* fall through */
   case WMI_11AG_CAP:
   band_2gig = true;
   band_5gig = true;
--
2.7.4



Gustavo,

Thanks for the adjustment.  It now looks good to me.



Glad to help. :)


Reviewed-by: Steve deRosier 


Thanks
--
Gustavo


[PATCH net-next 02/12] net: hns3: Add support for tx_accept_tag2 and tx_accept_untag2 config

2018-05-25 Thread Salil Mehta
From: Peng Li 

HNS3 Hardware can support up to two VLAN tags in transmit leg, the PPP
module can handle the packets based on the tag1 and tag2 config. This
patch adds support for tag2 config for vlan handling

Signed-off-by: Peng Li 
Signed-off-by: Salil Mehta 
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |  7 --
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 26 +-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h|  6 +++--
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index ee3cbac..3fa08f7 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -704,11 +704,14 @@ struct hclge_vlan_filter_vf_cfg_cmd {
u8  vf_bitmap[16];
 };
 
-#define HCLGE_ACCEPT_TAG_B 0
-#define HCLGE_ACCEPT_UNTAG_B   1
+#define HCLGE_ACCEPT_TAG1_B0
+#define HCLGE_ACCEPT_UNTAG1_B  1
 #define HCLGE_PORT_INS_TAG1_EN_B   2
 #define HCLGE_PORT_INS_TAG2_EN_B   3
 #define HCLGE_CFG_NIC_ROCE_SEL_B   4
+#define HCLGE_ACCEPT_TAG2_B5
+#define HCLGE_ACCEPT_UNTAG2_B  6
+
 struct hclge_vport_vtag_tx_cfg_cmd {
u8 vport_vlan_cfg;
u8 vf_offset;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 2f0bbb6..c0b8d5a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -4687,10 +4687,14 @@ static int hclge_set_vlan_tx_offload_cfg(struct 
hclge_vport *vport)
req = (struct hclge_vport_vtag_tx_cfg_cmd *)desc.data;
req->def_vlan_tag1 = cpu_to_le16(vcfg->default_tag1);
req->def_vlan_tag2 = cpu_to_le16(vcfg->default_tag2);
-   hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_TAG_B,
-vcfg->accept_tag ? 1 : 0);
-   hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_UNTAG_B,
-vcfg->accept_untag ? 1 : 0);
+   hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_TAG1_B,
+   vcfg->accept_tag1 ? 1 : 0);
+   hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_UNTAG1_B,
+   vcfg->accept_untag1 ? 1 : 0);
+   hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_TAG2_B,
+   vcfg->accept_tag2 ? 1 : 0);
+   hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_UNTAG2_B,
+   vcfg->accept_untag2 ? 1 : 0);
hnae_set_bit(req->vport_vlan_cfg, HCLGE_PORT_INS_TAG1_EN_B,
 vcfg->insert_tag1_en ? 1 : 0);
hnae_set_bit(req->vport_vlan_cfg, HCLGE_PORT_INS_TAG2_EN_B,
@@ -4814,8 +4818,18 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev)
 
for (i = 0; i < hdev->num_alloc_vport; i++) {
vport = &hdev->vport[i];
-   vport->txvlan_cfg.accept_tag = true;
-   vport->txvlan_cfg.accept_untag = true;
+   vport->txvlan_cfg.accept_tag1 = true;
+   vport->txvlan_cfg.accept_untag1 = true;
+
+   /* accept_tag2 and accept_untag2 are not supported on
+* pdev revision(0x20); newer revisions support them. Setting
+* these two fields will not return an error when the driver
+* sends commands to the firmware in revision(0x20).
+* These two fields cannot be configured by the user.
+*/
+   vport->txvlan_cfg.accept_tag2 = true;
+   vport->txvlan_cfg.accept_untag2 = true;
+
vport->txvlan_cfg.insert_tag1_en = false;
vport->txvlan_cfg.insert_tag2_en = false;
vport->txvlan_cfg.default_tag1 = 0;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 93177d9..677f1e4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -570,8 +570,10 @@ struct hclge_dev {
 
 /* VPort level vlan tag configuration for TX direction */
 struct hclge_tx_vtag_cfg {
-   bool accept_tag;/* Whether accept tagged packet from host */
-   bool accept_untag;  /* Whether accept untagged packet from host */
+   bool accept_tag1;   /* Whether accept tag1 packet from host */
+   bool accept_untag1; /* Whether accept untag1 packet from host */
+   bool accept_tag2;
+   bool accept_untag2;
bool insert_tag1_en;/* Whether insert inner vlan tag */
bool insert_tag2_en;/* Whether insert outer vlan tag */
u16  default_tag1;  /* The default inner vlan tag to insert */
-- 
2.7.4




[PATCH net-next 03/12] net: hns3: Add STRP_TAGP field support for hardware revision 0x21

2018-05-25 Thread Salil Mehta
From: Peng Li 

Hardware Revision(0x21) Buffer Descriptor adds a field STRP_TAGP
for vlan stripped processed indication. STRP_TAGP field has 2 bits,
bit 0 is stripped indication of the vlan tag in outer vlan tag
field, bit 1 is stripped indication of the vlan tag in inner vlan
tag field. For each bit, 0 indicates the tag is not stripped and
1 indicates the tag is stripped.

This patch adds STRP_TAGP support for revision(0x21), and does not
change the revision(0x20) action.

Signed-off-by: Peng Li 
Signed-off-by: Salil Mehta 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 42 ++---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h |  3 ++
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index ae8d749..1bcb676 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2066,6 +2066,39 @@ static void hns3_rx_skb(struct hns3_enet_ring *ring, 
struct sk_buff *skb)
napi_gro_receive(&ring->tqp_vector->napi, skb);
 }
 
+static u16 hns3_parse_vlan_tag(struct hns3_enet_ring *ring,
+  struct hns3_desc *desc, u32 l234info)
+{
+   struct pci_dev *pdev = ring->tqp->handle->pdev;
+   u16 vlan_tag;
+
+   if (pdev->revision == 0x20) {
+   vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
+   if (!(vlan_tag & VLAN_VID_MASK))
+   vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+
+   return vlan_tag;
+   }
+
+#define HNS3_STRP_OUTER_VLAN   0x1
+#define HNS3_STRP_INNER_VLAN   0x2
+
+   switch (hnae_get_field(l234info, HNS3_RXD_STRP_TAGP_M,
+  HNS3_RXD_STRP_TAGP_S)) {
+   case HNS3_STRP_OUTER_VLAN:
+   vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
+   break;
+   case HNS3_STRP_INNER_VLAN:
+   vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+   break;
+   default:
+   vlan_tag = 0;
+   break;
+   }
+
+   return vlan_tag;
+}
+
 static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 struct sk_buff **out_skb, int *out_bnum)
 {
@@ -2155,6 +2188,9 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
}
 
*out_bnum = bnum;
+
+   l234info = le32_to_cpu(desc->rx.l234_info);
+
/* Based on hw strategy, the tag offloaded will be stored at
 * ot_vlan_tag in two layer tag case, and stored at vlan_tag
 * in one layer tag case.
@@ -2162,17 +2198,13 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring 
*ring,
if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) {
u16 vlan_tag;
 
-   vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
-   if (!(vlan_tag & VLAN_VID_MASK))
-   vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+   vlan_tag = hns3_parse_vlan_tag(ring, desc, l234info);
if (vlan_tag & VLAN_VID_MASK)
__vlan_hwaccel_put_tag(skb,
   htons(ETH_P_8021Q),
   vlan_tag);
}
 
-   l234info = le32_to_cpu(desc->rx.l234_info);
-
if (unlikely(!hnae_get_bit(bd_base_info, HNS3_RXD_VLD_B))) {
netdev_err(netdev, "no valid bd,%016llx,%016llx\n",
   ((u64 *)desc)[0], ((u64 *)desc)[1]);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 5b40f5a..38e91ca 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -104,6 +104,9 @@ enum hns3_nic_state {
 #define HNS3_RXD_L4ID_S8
 #define HNS3_RXD_L4ID_M(0xf << HNS3_RXD_L4ID_S)
 #define HNS3_RXD_FRAG_B12
+#define HNS3_RXD_STRP_TAGP_S   13
+#define HNS3_RXD_STRP_TAGP_M   (0x3 << HNS3_RXD_STRP_TAGP_S)
+
 #define HNS3_RXD_L2E_B 16
 #define HNS3_RXD_L3E_B 17
 #define HNS3_RXD_L4E_B 18
-- 
2.7.4




Re: [PATCH net-next] net:sched: add action inheritdsfield to skbmod

2018-05-25 Thread Cong Wang
On Thu, May 24, 2018 at 10:45 PM, Fu, Qiaobin  wrote:
> The new action inheritdsfield copies the field DS of
> IPv4 and IPv6 packets into skb->priority. This enables
> later classification of packets based on the DS field.

Please move it to skbedit.


  1   2   3   >