Re: [PATCH net-next 1/3] ethtool: Ensure new ring parameters are within bounds during SRINGPARAM

2018-01-08 Thread Jakub Kicinski
On Tue, 9 Jan 2018 09:30:05 +0200
Tariq Toukan  wrote:
> >> diff --git a/net/core/ethtool.c b/net/core/ethtool.c
> >> index 50a79203043b..9ea7cd52fde0 100644
> >> --- a/net/core/ethtool.c
> >> +++ b/net/core/ethtool.c
> >> @@ -1704,14 +1704,23 @@ static int ethtool_get_ringparam(struct
> >> net_device *dev, void __user *useraddr) 
> >>   static int ethtool_set_ringparam(struct net_device *dev, void
> >> __user *useraddr) {
> >> -  struct ethtool_ringparam ringparam;
> >> +  struct ethtool_ringparam ringparam, max = { .cmd =
> >> ETHTOOL_GRINGPARAM }; 
> >> -  if (!dev->ethtool_ops->set_ringparam)
> >> +  if (!dev->ethtool_ops->set_ringparam
> >> || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP;
> >>   
> >>if (copy_from_user(&ringparam, useraddr,
> >> sizeof(ringparam))) return -EFAULT;
> >>   
> >> +  dev->ethtool_ops->get_ringparam(dev, &max);  
> > 
> > Perhaps check the return value here?  It's pretty unlikely but
> > get_ringparam may fail.
> >   
> 
> get_ringparam NDO returns void.

Ah, you're right, I looked at the return of ethtool_get_ringparam().


[patch iproute2 v7 2/2] tc: Add batchsize feature for filter and actions

2018-01-08 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this support, at most 128
commands can be accumulated before sending to kernel.

Now it only works for the following successive commands:
filter and actions add/delete/change/replace.

Signed-off-by: Chris Mi 
---
 tc/m_action.c  |  60 +--
 tc/tc.c| 127 ++---
 tc/tc_common.h |   5 ++-
 tc/tc_filter.c |  97 +--
 4 files changed, 210 insertions(+), 79 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..e5c53a80 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -546,40 +546,56 @@ bad_val:
return ret;
 }
 
+struct tc_action_req {
+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+};
+
 static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   void *buf)
 {
-   int argc = *argc_p;
+   struct tc_action_req *req, action_req;
char **argv = *argv_p;
+   struct rtattr *tail;
+   int argc = *argc_p;
+   struct iovec iov;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
-   };
-   struct rtattr *tail = NLMSG_TAIL(&req.n);
+
+   if (buf)
+   req = buf;
+   else
+   req = &action_req;
+
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   tail = NLMSG_TAIL(&req->n);
 
argc -= 1;
argv += 1;
-   if (parse_action(&argc, &argv, TCA_ACT_TAB, &req.n)) {
+   if (parse_action(&argc, &argv, TCA_ACT_TAB, &req->n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL(&req.n) - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(&req->n) - (void *) tail;
+
+   *argc_p = argc;
+   *argv_p = argv;
 
-   if (rtnl_talk(&rth, &req.n, NULL) < 0) {
+   iov.iov_base = &req->n;
+   iov.iov_len = req->n.nlmsg_len;
+
+   if (buf)
+   return 0;
+
+   if (rtnl_talk_iov(&rth, &iov, 1, NULL) < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
 
-   *argc_p = argc;
-   *argv_p = argv;
-
return ret;
 }
 
@@ -679,7 +695,7 @@ bad_val:
return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, void *buf)
 {
 
int ret = 0;
@@ -689,12 +705,12 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   &argc, &argv);
+   &argc, &argv, buf);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0) {
ret = tc_action_modify(RTM_NEWACTION,
   NLM_F_CREATE | NLM_F_REPLACE,
-  &argc, &argv);
+  &argc, &argv, buf);
} else if (matches(*argv, "delete") == 0) {
argc -= 1;
argv += 1;
diff --git a/tc/tc.c b/tc/tc.c
index ad9f07e9..f32e4978 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -193,16 +193,16 @@ static void usage(void)
"-nm | -nam[es] | { -cf | -conf } 
path } | -j[son]\n");
 }
 
-static int do_cmd(int argc, char **argv)
+static int do_cmd(int argc, char **argv, void *buf)
 {
if (matches(*argv, "qdisc") == 0)
return do_qdisc(argc-1, argv+1);
if (matches(*argv, "class") == 0)
return do_class(argc-1, argv+1);
if (matches(*argv, "filter") == 0)
-   return do_filter(argc-1, argv+1);
+   return do_filter(argc-1, argv+1, buf);
if (matches(*argv, "actions") == 0)
-   return do_action(argc-1, argv+1);
+   return do_action(argc-1, argv+1, buf);
if (matches(*argv, "monitor") == 0)
return do_tcmonitor(argc-1, argv+1);
if (matches(*argv, "exec") == 0)
@@ -217,11 +21

Re: [PATCH net-next 1/3] ethtool: Ensure new ring parameters are within bounds during SRINGPARAM

2018-01-08 Thread Tariq Toukan



On 09/01/2018 4:23 AM, Jakub Kicinski wrote:

On Mon,  8 Jan 2018 16:00:24 +0200, Tariq Toukan wrote:

From: Eugenia Emantayev 

Add a sanity check to ensure that all requested ring parameters
are within bounds, which should reduce errors in driver implementation.


(y)


Signed-off-by: Eugenia Emantayev 
Signed-off-by: Tariq Toukan 
---
  net/core/ethtool.c | 13 +++--
  1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 50a79203043b..9ea7cd52fde0 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1704,14 +1704,23 @@ static int ethtool_get_ringparam(struct net_device 
*dev, void __user *useraddr)
  
  static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)

  {
-   struct ethtool_ringparam ringparam;
+   struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM };
  
-	if (!dev->ethtool_ops->set_ringparam)

+   if (!dev->ethtool_ops->set_ringparam || 
!dev->ethtool_ops->get_ringparam)
return -EOPNOTSUPP;
  
  	if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))

return -EFAULT;
  
+	dev->ethtool_ops->get_ringparam(dev, &max);


Perhaps check the return value here?  It's pretty unlikely but
get_ringparam may fail.



get_ringparam NDO returns void.


+   /* ensure new ring parameters are within the maximums */
+   if (ringparam.rx_pending > max.rx_max_pending ||
+   ringparam.rx_mini_pending > max.rx_mini_max_pending ||
+   ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending ||
+   ringparam.tx_pending > max.tx_max_pending)
+   return -EINVAL;
+
return dev->ethtool_ops->set_ringparam(dev, &ringparam);
  }
  


Re: [PATCH] ath9k: add a quirk to set use_msi automatically

2018-01-08 Thread Kalle Valo
AceLan Kao  writes:

> Some platforms (BIOS) block legacy interrupts (INTx), and only allow MSI
> for the WLAN device. So add a quirk to list those machines and set
> use_msi automatically.
> Add the following platforms to the quirk:
>Dell Inspiron 24-3460
>Dell Inspiron 3472
>Dell Inspiron 14-3473
>Dell Vostro 3262
>Dell Vostro 15-3572
>
> Signed-off-by: AceLan Kao 

[...]

> @@ -96,6 +97,56 @@ static const struct ieee80211_tpt_blink ath9k_tpt_blink[] 
> = {
>  };
>  #endif
>  
> +static int __init set_use_msi(const struct dmi_system_id *dmi)
> +{
> + ath9k_use_msi = 1;
> + return 1;
> +}
> +
> +static const struct dmi_system_id ath9k_quirks[] __initconst = {
> + {
> + .callback = set_use_msi,
> + .ident = "Dell Inspiron 24-3460",
> + .matches = {
> + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
> + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 24-3460"),
> + },
> + },

Larry, didn't rtlwifi have a similar situation where, with certain laptops,
users were required to enable a module parameter to get the device
working? I think rtlwifi should do the same as AceLan does here, as then
the user would not need to manually set the module parameter.

-- 
Kalle Valo


Re: [PATCH net-next v3 06/10] net/mlx5e: Change Mellanox references in DIM code

2018-01-08 Thread Saeed Mahameed



On 01/08/2018 11:06 PM, Saeed Mahameed wrote:



On 01/08/2018 10:13 PM, Andy Gospodarek wrote:

From: Andy Gospodarek 

Change all appropriate mlx5_am* and MLX5_AM* references to net_dim and
NET_DIM, respectively, in code that handles dynamic interrupt
moderation.  Also change all references from 'am' to 'dim' when used as
local variables and add generic profile references.

Signed-off-by: Andy Gospodarek 
Acked-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
  drivers/net/ethernet/mellanox/mlx5/core/en.h   |   9 +-
  drivers/net/ethernet/mellanox/mlx5/core/en_dim.c   |  14 +-
  .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   6 +-
  drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  40 ++-
  drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |   8 +-
  drivers/net/ethernet/mellanox/mlx5/core/net_dim.c  | 286 
++---

  drivers/net/ethernet/mellanox/mlx5/core/net_dim.h  |  63 ++---
  7 files changed, 225 insertions(+), 201 deletions(-)



[...]


  #define IS_SIGNIFICANT_DIFF(val, ref) \
  (((100 * abs((val) - (ref))) / (ref)) > 10) /* more than 10% 
difference */

-static int mlx5e_am_stats_compare(struct mlx5e_rx_am_stats *curr,
-  struct mlx5e_rx_am_stats *prev)
+static int net_dim_stats_compare(struct net_dim_stats *curr,
+ struct net_dim_stats *prev)
  {
  if (!prev->bpms)
-    return curr->bpms ? MLX5E_AM_STATS_BETTER :
-    MLX5E_AM_STATS_SAME;
+    return curr->bpms ? NET_DIM_STATS_BETTER :
+    NET_DIM_STATS_SAME;
  if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))
-    return (curr->bpms > prev->bpms) ? MLX5E_AM_STATS_BETTER :
-   MLX5E_AM_STATS_WORSE;
+    return (curr->bpms > prev->bpms) ? NET_DIM_STATS_BETTER :
+   NET_DIM_STATS_WORSE;


Hey Andy,

I am currently reviewing a patch internally that fixes a bug in this 
area, prev->ppms can be 0 and could cause IS_SIGNIFICANT_DIFF ouch !


I meant cause division by 0 in "IS_SIGNIFICANT_DIFF"

same goes for prev->epms, for some reason we had a broken assumption 
that if ppms is 0 for some reason then the bpms is 0 and the above 
condition will cover us.


Anyway the patch will go to net, which means when this series gets 
accepted then net-next will fail to merge with net and we need to 
manually push the fix to the new DIM library.


But for now I don't think anything is required for this series other 
than bringing this division by 0 issue and the future merge conflict to 
your attention.



  if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
-    return (curr->ppms > prev->ppms) ? MLX5E_AM_STATS_BETTER :
-   MLX5E_AM_STATS_WORSE;
+    return (curr->ppms > prev->ppms) ? NET_DIM_STATS_BETTER :
+   NET_DIM_STATS_WORSE;
  if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms))
-    return (curr->epms < prev->epms) ? MLX5E_AM_STATS_BETTER :
-   MLX5E_AM_STATS_WORSE;
+    return (curr->epms < prev->epms) ? NET_DIM_STATS_BETTER :
+   NET_DIM_STATS_WORSE;
-    return MLX5E_AM_STATS_SAME;
+    return NET_DIM_STATS_SAME;
  }


Re: [PATCH net-next v3 09/10] bnxt_en: add support for software dynamic interrupt moderation

2018-01-08 Thread Michael Chan
On Mon, Jan 8, 2018 at 10:13 PM, Andy Gospodarek  wrote:
> From: Andy Gospodarek 
>
> This implements the changes needed for the bnxt_en driver to add support
> for dynamic interrupt moderation per ring.
>
> This does add additional counters in the receive path, but testing shows
> that any additional instructions are offset by throughput gain when the
> default configuration is for low latency.
>
> Signed-off-by: Andy Gospodarek 
> Cc: Michael Chan 

Acked-by: Michael Chan 


Re: [PATCH net-next v3 06/10] net/mlx5e: Change Mellanox references in DIM code

2018-01-08 Thread Saeed Mahameed



On 01/08/2018 10:13 PM, Andy Gospodarek wrote:

From: Andy Gospodarek 

Change all appropriate mlx5_am* and MLX5_AM* references to net_dim and
NET_DIM, respectively, in code that handles dynamic interrupt
moderation.  Also change all references from 'am' to 'dim' when used as
local variables and add generic profile references.

Signed-off-by: Andy Gospodarek 
Acked-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
  drivers/net/ethernet/mellanox/mlx5/core/en.h   |   9 +-
  drivers/net/ethernet/mellanox/mlx5/core/en_dim.c   |  14 +-
  .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   6 +-
  drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  40 ++-
  drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |   8 +-
  drivers/net/ethernet/mellanox/mlx5/core/net_dim.c  | 286 ++---
  drivers/net/ethernet/mellanox/mlx5/core/net_dim.h  |  63 ++---
  7 files changed, 225 insertions(+), 201 deletions(-)



[...]

  
  #define IS_SIGNIFICANT_DIFF(val, ref) \

(((100 * abs((val) - (ref))) / (ref)) > 10) /* more than 10% difference 
*/
  
-static int mlx5e_am_stats_compare(struct mlx5e_rx_am_stats *curr,

- struct mlx5e_rx_am_stats *prev)
+static int net_dim_stats_compare(struct net_dim_stats *curr,
+struct net_dim_stats *prev)
  {
if (!prev->bpms)
-   return curr->bpms ? MLX5E_AM_STATS_BETTER :
-   MLX5E_AM_STATS_SAME;
+   return curr->bpms ? NET_DIM_STATS_BETTER :
+   NET_DIM_STATS_SAME;
  
  	if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))

-   return (curr->bpms > prev->bpms) ? MLX5E_AM_STATS_BETTER :
-  MLX5E_AM_STATS_WORSE;
+   return (curr->bpms > prev->bpms) ? NET_DIM_STATS_BETTER :
+  NET_DIM_STATS_WORSE;
  


Hey Andy,

I am currently reviewing a patch internally that fixes a bug in this 
area, prev->ppms can be 0 and could cause IS_SIGNIFICANT_DIFF ouch !
same goes for prev->epms, for some reason we had a broken assumption 
that if ppms is 0 for some reason then the bpms is 0 and the above 
condition will cover us.


Anyway the patch will go to net, which means when this series gets 
accepted then net-next will fail to merge with net and we need to 
manually push the fix to the new DIM library.


But for now I don't think anything is required for this series other 
than bringing this division by 0 issue and the future merge conflict to 
your attention.



if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
-   return (curr->ppms > prev->ppms) ? MLX5E_AM_STATS_BETTER :
-  MLX5E_AM_STATS_WORSE;
+   return (curr->ppms > prev->ppms) ? NET_DIM_STATS_BETTER :
+  NET_DIM_STATS_WORSE;
  
  	if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms))

-   return (curr->epms < prev->epms) ? MLX5E_AM_STATS_BETTER :
-  MLX5E_AM_STATS_WORSE;
+   return (curr->epms < prev->epms) ? NET_DIM_STATS_BETTER :
+  NET_DIM_STATS_WORSE;
  
-	return MLX5E_AM_STATS_SAME;

+   return NET_DIM_STATS_SAME;
  }
  


[patch iproute2 v7 1/2] lib/libnetlink: Add functions rtnl_talk_msg and rtnl_talk_iov

2018-01-08 Thread Chris Mi
rtnl_talk can only send a single message to kernel. Add two functions
rtnl_talk_msg and rtnl_talk_iov that can send multiple messages to kernel.
rtnl_talk_msg takes struct msghdr * as argument.
rtnl_talk_iov takes struct iovec * and iovlen as arguments.

Signed-off-by: Chris Mi 
---
 include/libnetlink.h |  6 
 lib/libnetlink.c | 84 
 2 files changed, 71 insertions(+), 19 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..e9a63dbc 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,12 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
+int rtnl_talk_iov(struct rtnl_handle *rtnl, struct iovec *iovec, size_t iovlen,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..ae0059f9 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,39 +581,43 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
-   int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   int i, status, iovlen = m->msg_iovlen;
+   struct iovec iov;
struct msghdr msg = {
.msg_name = &nladdr,
.msg_namelen = sizeof(nladdr),
.msg_iov = &iov,
.msg_iovlen = 1,
};
-   char *buf;
-
-   n->nlmsg_seq = seq = ++rtnl->seq;
+   unsigned int seq = 0;
+   struct nlmsghdr *h;
 
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
+   for (i = 0; i < iovlen; i++) {
+   struct iovec *v;
+   v = &m->msg_iov[i];
+   h = v->iov_base;
+   h->nlmsg_seq = seq = ++rtnl->seq;
+   if (answer == NULL)
+   h->nlmsg_flags |= NLM_F_ACK;
+   }
 
-   status = sendmsg(rtnl->fd, &msg, 0);
+   status = sendmsg(rtnl->fd, m, 0);
if (status < 0) {
perror("Cannot talk to rtnetlink");
return -1;
}
 
+   i = 0;
while (1) {
+   char *buf;
+next:
status = rtnl_recvmsg(rtnl->fd, &msg, &buf);
+   ++i;
 
if (status < 0)
return status;
@@ -642,7 +646,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
if (nladdr.nl_pid != 0 ||
h->nlmsg_pid != rtnl->local.nl_pid ||
-   h->nlmsg_seq != seq) {
+   h->nlmsg_seq > seq || h->nlmsg_seq < seq - iovlen) {
/* Don't forget to skip that message. */
status -= NLMSG_ALIGN(len);
h = (struct nlmsghdr *)((char *)h + 
NLMSG_ALIGN(len));
@@ -662,7 +666,10 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
*answer = (struct nlmsghdr 
*)buf;
else
free(buf);
-   return 0;
+   if (h->nlmsg_seq == seq)
+   return 0;
+   else
+   goto next;
}
 
if (rtnl->proto != NETLINK_SOCK_DIAG &&
@@ -671,7 +678,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
errno = -err->error;
free(buf);
-   return -1;
+   return -i;
}
 
if (answer) {
@@ -698,12 +705,51 @@ static int __rtnl_talk(struct r

[patch iproute2 v7 0/2] tc: Add batchsize feature to batch mode

2018-01-08 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patchset, at most 128
commands can be accumulated before sending to kernel.

We introduced two new functions in patch 1 to support for sending
multiple messages. In patch 2, we add this support for filter and
actions add/delete/change/replace commands.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patchset is the user mode and kernel mode
context switch. So this patchset works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real0m15.555s
user0m7.211s
sys 0m8.284s

With this patchset, 'tc -b $file' execution time is:

real0m13.562s
user0m6.463s
sys 0m7.031s

The insertion rate is improved by more than 10%.

v3
==
1. Instead of hacking function rtnl_talk directly, add a new function
   rtnl_talk_msg.
2. remove most of global variables to use parameter passing
3. divide the previous patch into 4 patches.

v4
==
1. Remove function setcmdlinetotal. Now in function batch, we read one
   more line to determine if we are reaching the end of file.
2. Remove function __rtnl_check_ack. Now __rtnl_talk calls __rtnl_talk_msg
   directly.
3. if (batch_size < 1)
batch_size = 1;

v5
==
1. Fix a bug where a batch file containing blank lines could not be handled.
2. Describe the limitation in man page.

v6
==
1. Add support for mixed commands.
2. Fix a bug where not all messages are acked if batch size > 1.

v7
==
1. We can tell exactly which command fails.
2. Add a new function rtnl_talk_iov
3. Allocate the memory in function batch() instead of each client.
4. Remove option -bs.


Chris Mi (2):
  lib/libnetlink: Add functions rtnl_talk_msg and rtnl_talk_iov
  tc: Add batchsize feature to batch mode

 include/libnetlink.h |   6 +++
 lib/libnetlink.c |  84 ++
 tc/m_action.c|  60 +++-
 tc/tc.c  | 127 +--
 tc/tc_common.h   |   5 +-
 tc/tc_filter.c   |  97 +++
 6 files changed, 281 insertions(+), 98 deletions(-)

-- 
2.14.3



Re: [PATCH net-next v3 00/10] net: create dynamic software irq moderation library

2018-01-08 Thread Tal Gilboa

On 1/9/2018 8:13 AM, Andy Gospodarek wrote:

From: Andy Gospodarek 

This converts the dynamic interrupt moderation library from the mlx5e
driver into a library so it can be used by any driver.  The penultimate
patch in this set adds support for thiw new dynamic interrupt moderation
library in the bnxt_en driver and the last patch creates an entry in the
MAINTAINERS file for this library.


thiw->this.


[PATCH] ipvlan: fix ipvlan MTU limits

2018-01-08 Thread liuqifa
From: Keefe Liu 

The MTU of an ipvlan interface should not be bigger than that of the phy
device. When we run the following scripts, we find there are some problems.
Step1:
ip link add link eth0 name ipv1 type ipvlan mode l2
ip netns add net1
ip link set dev ipv1 netns net1
Step2:
ip netns exec net1 ip link set dev ipv1 mtu 1501
RTNETLINK answers: Invalid argument
dmesg info: "ipv1: Invalid MTU 1501 requested, hw max 1500"
Step3:
ip link set dev eth0 mtu 1600
ip netns exec net1 ip link set dev ipv1 mtu 1501
RTNETLINK answers: Invalid argument
dmesg info: "ipv1: Invalid MTU 1501 requested, hw max 1500"
Step4:
ip link set dev eth0 mtu 1400
ip netns exec net1 ip link set dev ipv1 mtu 1500
The result of Step2 is what we expected, but the results of Step3 and Step4
are not.

This patch sets ipvlan's maximum MTU to ETH_MAX_MTU, and when we change
the ipvlan device's MTU, ipvlan_change_mtu() will make sure the new MTU
is no larger than the phy device's MTU.

Signed-off-by: Keefe Liu 
---
 drivers/net/ipvlan/ipvlan_main.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 30cb803..84c007d 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -380,12 +380,24 @@ static int ipvlan_get_iflink(const struct net_device *dev)
return ipvlan->phy_dev->ifindex;
 }
 
+static int ipvlan_change_mtu(struct net_device *dev, int new_mtu)
+{
+   struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+   if (ipvlan->phy_dev->mtu < new_mtu)
+   return -EINVAL;
+
+   dev->mtu = new_mtu;
+   return 0;
+}
+
 static const struct net_device_ops ipvlan_netdev_ops = {
.ndo_init   = ipvlan_init,
.ndo_uninit = ipvlan_uninit,
.ndo_open   = ipvlan_open,
.ndo_stop   = ipvlan_stop,
.ndo_start_xmit = ipvlan_start_xmit,
+   .ndo_change_mtu = ipvlan_change_mtu,
.ndo_fix_features   = ipvlan_fix_features,
.ndo_change_rx_flags= ipvlan_change_rx_flags,
.ndo_set_rx_mode= ipvlan_set_multicast_mac_filter,
@@ -680,6 +692,8 @@ void ipvlan_link_setup(struct net_device *dev)
 {
ether_setup(dev);
 
+   dev->min_mtu = 0;
+   dev->max_mtu = ETH_MAX_MTU;
dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
dev->netdev_ops = &ipvlan_netdev_ops;
-- 
1.8.3.1




RE: [patch iproute2 v6 2/3] tc: Add -bs option to batch mode

2018-01-08 Thread Chris Mi
> -Original Message-
> From: Marcelo Ricardo Leitner [mailto:marcelo.leit...@gmail.com]
> Sent: Saturday, January 6, 2018 3:15 AM
> To: David Ahern 
> Cc: Chris Mi ; netdev@vger.kernel.org;
> gerlitz...@gmail.com; step...@networkplumber.org
> Subject: Re: [patch iproute2 v6 2/3] tc: Add -bs option to batch mode
> 
> On Fri, Jan 05, 2018 at 11:15:59AM -0700, David Ahern wrote:
> > On 1/4/18 12:34 AM, Chris Mi wrote:
> > > Currently in tc batch mode, only one command is read from the batch
> > > file and sent to kernel to process. With this support, we can
> > > accumulate several commands before sending to kernel.
> > >
> > > Now it only works for the following successive rules, 1. filter add
> > > 2. filter delete 3. actions add 4. actions delete
> > >
> > > Otherwise, the batch size is still 1.
> > >
> > > Signed-off-by: Chris Mi 
> > > ---
> > >  tc/m_action.c  |  93 ++--
> > >  tc/tc.c|  96 +++--
> > >  tc/tc_common.h |   8 +++-
> > >  tc/tc_filter.c | 132
> > > -
> > >  4 files changed, 252 insertions(+), 77 deletions(-)
> > >
> > > diff --git a/tc/m_action.c b/tc/m_action.c index fc422364..cf5cc95d
> > > 100644
> > > --- a/tc/m_action.c
> > > +++ b/tc/m_action.c
> > > @@ -23,6 +23,7 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > >
> > >  #include "utils.h"
> > >  #include "tc_common.h"
> > > @@ -546,40 +547,86 @@ bad_val:
> > >   return ret;
> > >  }
> > >
> > > +typedef struct {
> > > + struct nlmsghdr n;
> > > + struct tcamsg   t;
> > > + charbuf[MAX_MSG];
> > > +} tc_action_req;
> > > +
> > > +static tc_action_req *action_reqs;
> > > +static struct iovec msg_iov[MSG_IOV_MAX];
> > > +
> > > +void free_action_reqs(void)
> > > +{
> > > + free(action_reqs);
> > > +}
> > > +
> > > +static tc_action_req *get_action_req(int batch_size, int index) {
> > > + tc_action_req *req;
> > > +
> > > + if (action_reqs == NULL) {
> > > + action_reqs = malloc(batch_size * sizeof (tc_action_req));
> > > + if (action_reqs == NULL)
> > > + return NULL;
> > > + }
> > > + req = &action_reqs[index];
> > > + memset(req, 0, sizeof (*req));
> > > +
> > > + return req;
> > > +}
> > > +
> > >  static int tc_action_modify(int cmd, unsigned int flags,
> > > - int *argc_p, char ***argv_p)
> > > + int *argc_p, char ***argv_p,
> > > + int batch_size, int index, bool send)
> > >  {
> > > - int argc = *argc_p;
> > > + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
> > > + struct iovec *iov = &msg_iov[index];
> > >   char **argv = *argv_p;
> > > - int ret = 0;
> > > - struct {
> > > - struct nlmsghdr n;
> > > - struct tcamsg   t;
> > > - charbuf[MAX_MSG];
> > > - } req = {
> > > - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
> > > - .n.nlmsg_flags = NLM_F_REQUEST | flags,
> > > - .n.nlmsg_type = cmd,
> > > - .t.tca_family = AF_UNSPEC,
> > > + struct msghdr msg = {
> > > + .msg_name = &nladdr,
> > > + .msg_namelen = sizeof(nladdr),
> > > + .msg_iov = msg_iov,
> > > + .msg_iovlen = index + 1,
> > >   };
> > > - struct rtattr *tail = NLMSG_TAIL(&req.n);
> > > + struct rtattr *tail;
> > > + tc_action_req *req;
> > > + int argc = *argc_p;
> > > + int ret = 0;
> > > +
> > > + req = get_action_req(batch_size, index);
> > > + if (req == NULL) {
> > > + fprintf(stderr, "get_action_req error: not enough buffer\n");
> > > + return -ENOMEM;
> > > + }
> > > + req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
> > > + req->n.nlmsg_flags = NLM_F_REQUEST | flags;
> > > + req->n.nlmsg_type = cmd;
> > > + req->t.tca_family = AF_UNSPEC;
> > > + tail = NLMSG_TAIL(&req->n);
> > >
> > >   argc -= 1;
> > >   argv += 1;
> > > - if (parse_action(&argc, &argv, TCA_ACT_TAB, &req.n)) {
> > > + if (parse_action(&argc, &argv, TCA_ACT_TAB, &req->n)) {
> > >   fprintf(stderr, "Illegal \"action\"\n");
> > >   return -1;
> > >   }
> > > - tail->rta_len = (void *) NLMSG_TAIL(&req.n) - (void *) tail;
> > > + tail->rta_len = (void *) NLMSG_TAIL(&req->n) - (void *) tail;
> > >
> > > - if (rtnl_talk(&rth, &req.n, NULL) < 0) {
> > > + *argc_p = argc;
> > > + *argv_p = argv;
> > > +
> > > + iov->iov_base = &req->n;
> > > + iov->iov_len = req->n.nlmsg_len;
> > > +
> > > + if (!send)
> > > + return 0;
> > > +
> > > + if (rtnl_talk_msg(&rth, &msg, NULL) < 0) {
> > >   fprintf(stderr, "We have an error talking to the kernel\n");
> > >   ret = -1;
> > >   }
> > >
> > > - *argc_p = argc;
> > > - *argv_p = argv;
> > > -
> > >   return ret;
> > >  }
> > >
> > > @@ -679,7 +726,7 @@ bad_val:
> > >   return ret;
> > >  }
> > >
> > > -int do_action(int argc, char **argv)
> > > +int do_action(int 

[net-next 05/10] net/mlx5e: IPoIB, Use correct timestamp in child receive flow

2018-01-08 Thread Saeed Mahameed
From: Feras Daoud 

The current implementation takes the child timestamp object from
the parent since the rq in mlx5i_complete_rx_cqe belongs to the parent.
This change fixes the issue by taking the correct timestamp.

Fixes: 7e7f4780c340 ("net/mlx5e: IPoIB, Use hash-table to map between QPN to 
child netdev")
Signed-off-by: Feras Daoud 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 90354e676f0d..ff234dfefc27 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1175,7 +1175,9 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq 
*rq,
 u32 cqe_bcnt,
 struct sk_buff *skb)
 {
+   struct hwtstamp_config *tstamp;
struct net_device *netdev;
+   struct mlx5e_priv *priv;
char *pseudo_header;
u32 qpn;
u8 *dgid;
@@ -1194,6 +1196,9 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq 
*rq,
return;
}
 
+   priv = mlx5i_epriv(netdev);
+   tstamp = &priv->tstamp;
+
g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3;
dgid = skb->data + MLX5_IB_GRH_DGID_OFFSET;
if ((!g) || dgid[0] != 0xff)
@@ -1214,7 +1219,7 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq 
*rq,
skb->ip_summed = CHECKSUM_COMPLETE;
skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
 
-   if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp)))
+   if (unlikely(mlx5e_rx_hw_stamp(tstamp)))
skb_hwtstamps(skb)->hwtstamp =
mlx5_timecounter_cyc2time(rq->clock, 
get_cqe_ts(cqe));
 
-- 
2.13.0



[net-next 02/10] net/mlx5: Hairpin pair core object setup

2018-01-08 Thread Saeed Mahameed
From: Or Gerlitz 

Low level code to setup hairpin pair core object, deals with:
 - create hairpin RQs/SQs
 - destroy hairpin RQs/SQs
 - modifying hairpin RQs/SQs - pairing (rst2rdy) and unpairing (rdy2rst)

Unlike conventional RQs/SQs, the memory used for the packet and descriptor
buffers is allocated by the firmware and not the driver. The driver sets
the overall data size (log).

Signed-off-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 184 +
 include/linux/mlx5/transobj.h  |  19 +++
 2 files changed, 203 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c 
b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index 5e128d7a9ffd..a09ebbaf3b68 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -398,3 +398,187 @@ void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 
rqtn)
mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
 EXPORT_SYMBOL(mlx5_core_destroy_rqt);
+
+static int mlx5_hairpin_create_rq(struct mlx5_core_dev *mdev,
+ struct mlx5_hairpin_params *params, u32 *rqn)
+{
+   u32 in[MLX5_ST_SZ_DW(create_rq_in)] = {0};
+   void *rqc, *wq;
+
+   rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
+   wq  = MLX5_ADDR_OF(rqc, rqc, wq);
+
+   MLX5_SET(rqc, rqc, hairpin, 1);
+   MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
+   MLX5_SET(rqc, rqc, counter_set_id, params->q_counter);
+
+   MLX5_SET(wq, wq, log_hairpin_data_sz, params->log_data_size);
+
+   return mlx5_core_create_rq(mdev, in, MLX5_ST_SZ_BYTES(create_rq_in), 
rqn);
+}
+
+static int mlx5_hairpin_create_sq(struct mlx5_core_dev *mdev,
+ struct mlx5_hairpin_params *params, u32 *sqn)
+{
+   u32 in[MLX5_ST_SZ_DW(create_sq_in)] = {0};
+   void *sqc, *wq;
+
+   sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
+   wq  = MLX5_ADDR_OF(sqc, sqc, wq);
+
+   MLX5_SET(sqc, sqc, hairpin, 1);
+   MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST);
+
+   MLX5_SET(wq, wq, log_hairpin_data_sz, params->log_data_size);
+
+   return mlx5_core_create_sq(mdev, in, MLX5_ST_SZ_BYTES(create_sq_in), 
sqn);
+}
+
+static int mlx5_hairpin_create_queues(struct mlx5_hairpin *hp,
+ struct mlx5_hairpin_params *params)
+{
+   int err;
+
+   err = mlx5_hairpin_create_rq(hp->func_mdev, params, &hp->rqn);
+   if (err)
+   goto out_err_rq;
+
+   err = mlx5_hairpin_create_sq(hp->peer_mdev, params, &hp->sqn);
+   if (err)
+   goto out_err_sq;
+
+   return 0;
+
+out_err_sq:
+   mlx5_core_destroy_rq(hp->func_mdev, hp->rqn);
+out_err_rq:
+   return err;
+}
+
+static void mlx5_hairpin_destroy_queues(struct mlx5_hairpin *hp)
+{
+   mlx5_core_destroy_rq(hp->func_mdev, hp->rqn);
+   mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn);
+}
+
+static int mlx5_hairpin_modify_rq(struct mlx5_core_dev *func_mdev, u32 rqn,
+ int curr_state, int next_state,
+ u16 peer_vhca, u32 peer_sq)
+{
+   u32 in[MLX5_ST_SZ_DW(modify_rq_in)] = {0};
+   void *rqc;
+
+   rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+
+   if (next_state == MLX5_RQC_STATE_RDY) {
+   MLX5_SET(rqc, rqc, hairpin_peer_sq, peer_sq);
+   MLX5_SET(rqc, rqc, hairpin_peer_vhca, peer_vhca);
+   }
+
+   MLX5_SET(modify_rq_in, in, rq_state, curr_state);
+   MLX5_SET(rqc, rqc, state, next_state);
+
+   return mlx5_core_modify_rq(func_mdev, rqn,
+  in, MLX5_ST_SZ_BYTES(modify_rq_in));
+}
+
+static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn,
+ int curr_state, int next_state,
+ u16 peer_vhca, u32 peer_rq)
+{
+   u32 in[MLX5_ST_SZ_DW(modify_sq_in)] = {0};
+   void *sqc;
+
+   sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
+
+   if (next_state == MLX5_RQC_STATE_RDY) {
+   MLX5_SET(sqc, sqc, hairpin_peer_rq, peer_rq);
+   MLX5_SET(sqc, sqc, hairpin_peer_vhca, peer_vhca);
+   }
+
+   MLX5_SET(modify_sq_in, in, sq_state, curr_state);
+   MLX5_SET(sqc, sqc, state, next_state);
+
+   return mlx5_core_modify_sq(peer_mdev, sqn,
+  in, MLX5_ST_SZ_BYTES(modify_sq_in));
+}
+
+static int mlx5_hairpin_pair_queues(struct mlx5_hairpin *hp)
+{
+   int err;
+
+   /* set peer SQ */
+   err = mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn,
+MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY,
+MLX5_CAP_GEN(hp->func_mdev, vhca_id), 
hp->rqn);
+   if (err)
+   goto err_modify_sq;
+
+   /* set func RQ */
+   err = mlx5_hairpin_modify_rq(hp->func_mdev, 

[net-next 07/10] net/mlx5e: IPoIB, Add ethtool support to get child time stamping parameters

2018-01-08 Thread Saeed Mahameed
From: Feras Daoud 

Add support to get time stamping capabilities using ethtool for
child interface.
Usage example:
ethtool -T CHILD-DEVNAME

This change reuses the functionality of parent devices and does not
introduce any new logic.

Signed-off-by: Feras Daoud 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
index 6f338a9219c8..90cb50fe17fd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
@@ -254,4 +254,5 @@ const struct ethtool_ops mlx5i_ethtool_ops = {
 const struct ethtool_ops mlx5i_pkey_ethtool_ops = {
.get_drvinfo= mlx5i_get_drvinfo,
.get_link   = ethtool_op_get_link,
+   .get_ts_info= mlx5i_get_ts_info,
 };
-- 
2.13.0



[pull request][net-next 00/10] Mellanox, mlx5 updates 2018-01-08

2018-01-08 Thread Saeed Mahameed
Hi Dave,

This series includes updates for mlx5 driver, for full information please
see tag log message below.

The series doesn't cause any conflict with Andy's
"net: create dynamic software irq moderation library".  Both merge together
seamlessly.

Please pull and let me know if there's any problem.

Thanks,
Saeed.

---

The following changes since commit f4803f1b73f877a571be4c8e531dfcf190acc691:

  net: tipc: remove unused hardirq.h (2018-01-08 20:59:25 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5-updates-2018-01-08

for you to fetch changes up to b8a0dbe3a90b2debd72cd9a304eacac55c44e5a4:

  net/mlx5e: E-switch, Add steering drop counters (2018-01-09 07:40:48 +0200)


mlx5-updates-2018-01-08

Four patches from Or that add Hairpin support to mlx5:
===
From:  Or Gerlitz 

We refer to the ability of NIC HW to forward packets received on one port to
the other port (or from a port back to itself) as hairpin. The application API
is based on ingress tc/flower rules set on the NIC with the mirred redirect
action. Other actions can apply to packets during the redirect.

Hairpin allows offloading the data path of various SW DDoS gateways,
load balancers, etc. to HW. Packets go through all the required
processing in HW (header re-write, encap/decap, push/pop vlan) and
are then forwarded, while CPU usage stays at practically zero. HW flow
counters are used by the control plane for monitoring and accounting.

Hairpin is implemented by pairing a receive queue (RQ) to a send queue (SQ).
All the flows that share the same (receiving NIC, mirred NIC) device pair are
redirected through the same hairpin pair. Currently, only header-rewrite is
supported as a packet modification action.

I'd like to thank Elijah Shakkour for implementing this functionality
on the HW simulator before it was available in the FW, so the driver code
could be tested early.
===

From Feras, three patches that make very small changes allowing IPoIB
to support RX timestamping for child interfaces, simply by hooking the mlx5e
timestamping PTP ioctl to the IPoIB child interface netdev profile.

One patch from Gal to fix a spelling mistake.

Two patches from Eugenia add drop counters to the VF statistics
reported via netlink (iproute2) and implement them in the mlx5 eswitch.


Eugenia Emantayev (2):
  net/core: Add drop counters to VF statistics
  net/mlx5e: E-switch, Add steering drop counters

Feras Daoud (3):
  net/mlx5e: IPoIB, Use correct timestamp in child receive flow
  net/mlx5e: IPoIB, Add PTP ioctl support for child interface
  net/mlx5e: IPoIB, Add ethtool support to get child time stamping 
parameters

Gal Pressman (1):
  net/mlx5e: IPoIB, Fix spelling mistake "functionts" -> "functions"

Or Gerlitz (4):
  net/mlx5: Add hairpin definitions to the FW API
  net/mlx5: Hairpin pair core object setup
  net/mlx5e: Basic setup of hairpin object
  net/mlx5e: Support offloading TC NIC hairpin flows

 drivers/net/ethernet/mellanox/mlx5/core/en.h   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c|   7 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c| 280 -
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  99 +++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |   7 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |   2 +
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  |   6 +
 .../ethernet/mellanox/mlx5/core/ipoib/ethtool.c|   1 +
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |   3 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h  |   3 +-
 .../ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c |   7 +
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 184 ++
 include/linux/if_link.h|   2 +
 include/linux/mlx5/mlx5_ifc.h  |  43 +++-
 include/linux/mlx5/transobj.h  |  19 ++
 include/uapi/linux/if_link.h   |   2 +
 net/core/rtnetlink.c   |  10 +-
 17 files changed, 649 insertions(+), 27 deletions(-)


[net-next 08/10] net/mlx5e: IPoIB, Fix spelling mistake "functionts" -> "functions"

2018-01-08 Thread Saeed Mahameed
From: Gal Pressman 

Fix trivial spelling mistake: "functionts" -> "functions".

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
index 5e87d04652d2..6d9053bcbe95 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
@@ -76,7 +76,7 @@ int mlx5i_pkey_del_qpn(struct net_device *netdev, u32 qpn);
 /* Get the net-device corresponding to the given underlay QPN */
 struct net_device *mlx5i_pkey_get_netdev(struct net_device *netdev, u32 qpn);
 
-/* Shared ndo functionts */
+/* Shared ndo functions */
 int mlx5i_dev_init(struct net_device *dev);
 void mlx5i_dev_cleanup(struct net_device *dev);
 int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
-- 
2.13.0



[net-next 04/10] net/mlx5e: Support offloading TC NIC hairpin flows

2018-01-08 Thread Saeed Mahameed
From: Or Gerlitz 

We refer to a TC NIC rule that involves forwarding as "hairpin".

All hairpin rules from the current NIC device (called "func" in
the code) to a given NIC device ("peer") are steered into the
same hairpin RQ/SQ pair.

The hairpin pair is set on demand and removed when there are no
TC rules that need it.

Here's a TC rule that matches on icmp, does header re-write of the
dst mac and hairpin from RX/enp1s2f1 to TX/enp1s2f2 (enp1s2f1/2 are
two mlx5 devices):

tc filter add dev enp1s2f1 protocol ip parent ffff: prio 2
flower skip_sw ip_proto icmp
 action pedit ex munge eth dst set 10:22:33:44:55:66 pipe
 action mirred egress redirect dev enp1s2f2

Signed-off-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h|   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 183 ++--
 2 files changed, 172 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 5299310f2481..72bab8d3f4b0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -659,6 +659,7 @@ struct mlx5e_tc_table {
struct rhashtable   ht;
 
DECLARE_HASHTABLE(mod_hdr_tbl, 8);
+   DECLARE_HASHTABLE(hairpin_tbl, 8);
 };
 
 struct mlx5e_vlan_table {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 55a527bda2e5..cf528da51243 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -56,12 +56,14 @@ struct mlx5_nic_flow_attr {
u32 action;
u32 flow_tag;
u32 mod_hdr_id;
+   u32 hairpin_tirn;
 };
 
 enum {
MLX5E_TC_FLOW_ESWITCH   = BIT(0),
MLX5E_TC_FLOW_NIC   = BIT(1),
MLX5E_TC_FLOW_OFFLOADED = BIT(2),
+   MLX5E_TC_FLOW_HAIRPIN   = BIT(3),
 };
 
 struct mlx5e_tc_flow {
@@ -71,6 +73,7 @@ struct mlx5e_tc_flow {
struct mlx5_flow_handle *rule;
struct list_headencap;   /* flows sharing the same encap ID */
struct list_headmod_hdr; /* flows sharing the same mod hdr ID */
+   struct list_headhairpin; /* flows sharing the same hairpin */
union {
struct mlx5_esw_flow_attr esw_attr[0];
struct mlx5_nic_flow_attr nic_attr[0];
@@ -101,6 +104,17 @@ struct mlx5e_hairpin {
u32 tirn;
 };
 
+struct mlx5e_hairpin_entry {
+   /* a node of a hash table which keeps all the  hairpin entries */
+   struct hlist_node hairpin_hlist;
+
+   /* flows sharing the same hairpin */
+   struct list_head flows;
+
+   int peer_ifindex;
+   struct mlx5e_hairpin *hp;
+};
+
 struct mod_hdr_key {
int num_actions;
void *actions;
@@ -319,6 +333,98 @@ static void mlx5e_hairpin_destroy(struct mlx5e_hairpin *hp)
kvfree(hp);
 }
 
+static struct mlx5e_hairpin_entry *mlx5e_hairpin_get(struct mlx5e_priv *priv,
+int peer_ifindex)
+{
+   struct mlx5e_hairpin_entry *hpe;
+
+   hash_for_each_possible(priv->fs.tc.hairpin_tbl, hpe,
+  hairpin_hlist, peer_ifindex) {
+   if (hpe->peer_ifindex == peer_ifindex)
+   return hpe;
+   }
+
+   return NULL;
+}
+
+static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
+ struct mlx5e_tc_flow *flow,
+ struct mlx5e_tc_flow_parse_attr *parse_attr)
+{
+   int peer_ifindex = parse_attr->mirred_ifindex;
+   struct mlx5_hairpin_params params;
+   struct mlx5e_hairpin_entry *hpe;
+   struct mlx5e_hairpin *hp;
+   int err;
+
+   if (!MLX5_CAP_GEN(priv->mdev, hairpin)) {
+   netdev_warn(priv->netdev, "hairpin is not supported\n");
+   return -EOPNOTSUPP;
+   }
+
+   hpe = mlx5e_hairpin_get(priv, peer_ifindex);
+   if (hpe)
+   goto attach_flow;
+
+   hpe = kzalloc(sizeof(*hpe), GFP_KERNEL);
+   if (!hpe)
+   return -ENOMEM;
+
+   INIT_LIST_HEAD(&hpe->flows);
+   hpe->peer_ifindex = peer_ifindex;
+
+   params.log_data_size = 15;
+   params.log_data_size = min_t(u8, params.log_data_size,
+MLX5_CAP_GEN(priv->mdev, 
log_max_hairpin_wq_data_sz));
+   params.log_data_size = max_t(u8, params.log_data_size,
+MLX5_CAP_GEN(priv->mdev, 
log_min_hairpin_wq_data_sz));
+   params.q_counter = priv->q_counter;
+
+   hp = mlx5e_hairpin_create(priv, ¶ms, peer_ifindex);
+   if (IS_ERR(hp)) {
+   err = PTR_ERR(hp);
+   goto create_hairpin_err;
+   }
+
+   netdev_dbg(priv->netdev, "add hairpin: tirn %x rqn %x peer %s sqn %x 
log data size %d\n",
+  hp->tirn, hp->pair->rqn, hp-

[net-next 03/10] net/mlx5e: Basic setup of hairpin object

2018-01-08 Thread Saeed Mahameed
From: Or Gerlitz 

Add the code to do basic setup for hairpin object which
will later serve offloading TC flows.

This includes calling the mlx5 core to create/destroy the hairpin
pair object and setting the HW transport objects that will be used
for steering matched flows to go through hairpin.

Signed-off-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 97 +
 1 file changed, 97 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 933275fe03b2..55a527bda2e5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -93,6 +93,14 @@ enum {
 #define MLX5E_TC_TABLE_NUM_GROUPS 4
 #define MLX5E_TC_TABLE_MAX_GROUP_SIZE (1 << 16)
 
+struct mlx5e_hairpin {
+   struct mlx5_hairpin *pair;
+
+   struct mlx5_core_dev *func_mdev;
+   u32 tdn;
+   u32 tirn;
+};
+
 struct mod_hdr_key {
int num_actions;
void *actions;
@@ -222,6 +230,95 @@ static void mlx5e_detach_mod_hdr(struct mlx5e_priv *priv,
}
 }
 
+static
+struct mlx5_core_dev *mlx5e_hairpin_get_mdev(struct net *net, int ifindex)
+{
+   struct net_device *netdev;
+   struct mlx5e_priv *priv;
+
+   netdev = __dev_get_by_index(net, ifindex);
+   priv = netdev_priv(netdev);
+   return priv->mdev;
+}
+
+static int mlx5e_hairpin_create_transport(struct mlx5e_hairpin *hp)
+{
+   u32 in[MLX5_ST_SZ_DW(create_tir_in)] = {0};
+   void *tirc;
+   int err;
+
+   err = mlx5_core_alloc_transport_domain(hp->func_mdev, &hp->tdn);
+   if (err)
+   goto alloc_tdn_err;
+
+   tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+
+   MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
+   MLX5_SET(tirc, tirc, inline_rqn, hp->pair->rqn);
+   MLX5_SET(tirc, tirc, transport_domain, hp->tdn);
+
+   err = mlx5_core_create_tir(hp->func_mdev, in, 
MLX5_ST_SZ_BYTES(create_tir_in), &hp->tirn);
+   if (err)
+   goto create_tir_err;
+
+   return 0;
+
+create_tir_err:
+   mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn);
+alloc_tdn_err:
+   return err;
+}
+
+static void mlx5e_hairpin_destroy_transport(struct mlx5e_hairpin *hp)
+{
+   mlx5_core_destroy_tir(hp->func_mdev, hp->tirn);
+   mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn);
+}
+
+static struct mlx5e_hairpin *
+mlx5e_hairpin_create(struct mlx5e_priv *priv, struct mlx5_hairpin_params 
*params,
+int peer_ifindex)
+{
+   struct mlx5_core_dev *func_mdev, *peer_mdev;
+   struct mlx5e_hairpin *hp;
+   struct mlx5_hairpin *pair;
+   int err;
+
+   hp = kzalloc(sizeof(*hp), GFP_KERNEL);
+   if (!hp)
+   return ERR_PTR(-ENOMEM);
+
+   func_mdev = priv->mdev;
+   peer_mdev = mlx5e_hairpin_get_mdev(dev_net(priv->netdev), peer_ifindex);
+
+   pair = mlx5_core_hairpin_create(func_mdev, peer_mdev, params);
+   if (IS_ERR(pair)) {
+   err = PTR_ERR(pair);
+   goto create_pair_err;
+   }
+   hp->pair = pair;
+   hp->func_mdev = func_mdev;
+
+   err = mlx5e_hairpin_create_transport(hp);
+   if (err)
+   goto create_transport_err;
+
+   return hp;
+
+create_transport_err:
+   mlx5_core_hairpin_destroy(hp->pair);
+create_pair_err:
+   kfree(hp);
+   return ERR_PTR(err);
+}
+
+static void mlx5e_hairpin_destroy(struct mlx5e_hairpin *hp)
+{
+   mlx5e_hairpin_destroy_transport(hp);
+   mlx5_core_hairpin_destroy(hp->pair);
+   kvfree(hp);
+}
+
 static struct mlx5_flow_handle *
 mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
  struct mlx5e_tc_flow_parse_attr *parse_attr,
-- 
2.13.0



[net-next 09/10] net/core: Add drop counters to VF statistics

2018-01-08 Thread Saeed Mahameed
From: Eugenia Emantayev 

Modern hardware can decide to drop packets going to/from a VF.
Add receive and transmit drop counters to be displayed at hypervisor
layer in iproute2 per VF statistics.

Signed-off-by: Eugenia Emantayev 
Signed-off-by: Saeed Mahameed 
---
 include/linux/if_link.h  |  2 ++
 include/uapi/linux/if_link.h |  2 ++
 net/core/rtnetlink.c | 10 +-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 4c54611e03e9..622658dfbf0a 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -13,6 +13,8 @@ struct ifla_vf_stats {
__u64 tx_bytes;
__u64 broadcast;
__u64 multicast;
+   __u64 rx_dropped;
+   __u64 tx_dropped;
 };
 
 struct ifla_vf_info {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 19fc02660e0c..f8f04fed6186 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -732,6 +732,8 @@ enum {
IFLA_VF_STATS_BROADCAST,
IFLA_VF_STATS_MULTICAST,
IFLA_VF_STATS_PAD,
+   IFLA_VF_STATS_RX_DROPPED,
+   IFLA_VF_STATS_TX_DROPPED,
__IFLA_VF_STATS_MAX,
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c688dc564b11..5421a3fd3ba1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -904,6 +904,10 @@ static inline int rtnl_vfinfo_size(const struct net_device 
*dev,
 nla_total_size_64bit(sizeof(__u64)) +
 /* IFLA_VF_STATS_MULTICAST */
 nla_total_size_64bit(sizeof(__u64)) +
+/* IFLA_VF_STATS_RX_DROPPED */
+nla_total_size_64bit(sizeof(__u64)) +
+/* IFLA_VF_STATS_TX_DROPPED */
+nla_total_size_64bit(sizeof(__u64)) +
 nla_total_size(sizeof(struct ifla_vf_trust)));
return size;
} else
@@ -1258,7 +1262,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct 
sk_buff *skb,
nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
  vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+ vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+   nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+ vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+   nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+ vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
nla_nest_cancel(skb, vfstats);
goto nla_put_vf_failure;
}
-- 
2.13.0



[net-next 06/10] net/mlx5e: IPoIB, Add PTP ioctl support for child interface

2018-01-08 Thread Saeed Mahameed
From: Feras Daoud 

Add support to control precision time protocol on child interfaces
using ioctl.

This commit changes the following:
- Change parent ioctl function to be non static
- Reuse the parent ioctl function in child devices

Signed-off-by: Feras Daoud 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  | 3 +--
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h  | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c | 7 +++
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 8812d7208e8f..3b2363e93ba5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -41,7 +41,6 @@
 static int mlx5i_open(struct net_device *netdev);
 static int mlx5i_close(struct net_device *netdev);
 static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu);
-static int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
 
 static const struct net_device_ops mlx5i_netdev_ops = {
.ndo_open= mlx5i_open,
@@ -396,7 +395,7 @@ int mlx5i_dev_init(struct net_device *dev)
return 0;
 }
 
-static int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
struct mlx5e_priv *priv = mlx5i_epriv(dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
index 49008022c306..5e87d04652d2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
@@ -79,6 +79,7 @@ struct net_device *mlx5i_pkey_get_netdev(struct net_device 
*netdev, u32 qpn);
 /* Shared ndo functionts */
 int mlx5i_dev_init(struct net_device *dev);
 void mlx5i_dev_cleanup(struct net_device *dev);
+int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
 
 /* Parent profile functions */
 void mlx5i_init(struct mlx5_core_dev *mdev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
index 531b02cc979b..b69e9d847a6b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
@@ -140,6 +140,7 @@ static int mlx5i_pkey_close(struct net_device *netdev);
 static int mlx5i_pkey_dev_init(struct net_device *dev);
 static void mlx5i_pkey_dev_cleanup(struct net_device *netdev);
 static int mlx5i_pkey_change_mtu(struct net_device *netdev, int new_mtu);
+static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int 
cmd);
 
 static const struct net_device_ops mlx5i_pkey_netdev_ops = {
.ndo_open= mlx5i_pkey_open,
@@ -147,6 +148,7 @@ static const struct net_device_ops mlx5i_pkey_netdev_ops = {
.ndo_init= mlx5i_pkey_dev_init,
.ndo_uninit  = mlx5i_pkey_dev_cleanup,
.ndo_change_mtu  = mlx5i_pkey_change_mtu,
+   .ndo_do_ioctl= mlx5i_pkey_ioctl,
 };
 
 /* Child NDOs */
@@ -174,6 +176,11 @@ static int mlx5i_pkey_dev_init(struct net_device *dev)
return mlx5i_dev_init(dev);
 }
 
+static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+   return mlx5i_ioctl(dev, ifr, cmd);
+}
+
 static void mlx5i_pkey_dev_cleanup(struct net_device *netdev)
 {
return mlx5i_dev_cleanup(netdev);
-- 
2.13.0



[net-next 10/10] net/mlx5e: E-switch, Add steering drop counters

2018-01-08 Thread Saeed Mahameed
From: Eugenia Emantayev 

Add flow counters to count packets dropped due to drop rules
configured in eswitch egress and ingress ACLs.
These counters count VF violations and incoming traffic drops, and are
presented on the hypervisor via the standard 'ip -s link show' command.

Example: "ip -s link show dev enp5s0f0"

6: enp5s0f0:  mtu 1500 qdisc mq state UP mode 
DEFAULT group default qlen 1000
link/ether 24:8a:07:a5:28:f0 brd ff:ff:ff:ff:ff:ff
RX: bytes  packets  errors  dropped overrun mcast
0  00   0   0   2
TX: bytes  packets  errors  dropped carrier collsns
1406   17   0   0   0   0
vf 0 MAC 00:00:ca:fe:ca:fe, vlan 5, spoof checking off, link-state auto, 
trust off, query_rss off
RX: bytes  packets  mcast   bcast   dropped
1666   29   14 32  0
TX: bytes  packets   dropped
2880   44   2412

Signed-off-by: Eugenia Emantayev 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 99 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  7 ++
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  2 +
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  |  6 ++
 4 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 7649e36653d9..5ecf2cddc16d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -37,6 +37,7 @@
 #include 
 #include "mlx5_core.h"
 #include "eswitch.h"
+#include "fs_core.h"
 
 #define UPLINK_VPORT 0x
 
@@ -1123,8 +1124,12 @@ static void esw_vport_disable_ingress_acl(struct 
mlx5_eswitch *esw,
 static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
struct mlx5_vport *vport)
 {
+   struct mlx5_fc *counter = vport->ingress.drop_counter;
+   struct mlx5_flow_destination drop_ctr_dst = {0};
+   struct mlx5_flow_destination *dst = NULL;
struct mlx5_flow_act flow_act = {0};
struct mlx5_flow_spec *spec;
+   int dest_num = 0;
int err = 0;
u8 *smac_v;
 
@@ -1188,9 +1193,18 @@ static int esw_vport_ingress_config(struct mlx5_eswitch 
*esw,
 
memset(spec, 0, sizeof(*spec));
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+
+   /* Attach drop flow counter */
+   if (counter) {
+   flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+   drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+   drop_ctr_dst.counter = counter;
+   dst = &drop_ctr_dst;
+   dest_num++;
+   }
vport->ingress.drop_rule =
mlx5_add_flow_rules(vport->ingress.acl, spec,
-   &flow_act, NULL, 0);
+   &flow_act, dst, dest_num);
if (IS_ERR(vport->ingress.drop_rule)) {
err = PTR_ERR(vport->ingress.drop_rule);
esw_warn(esw->dev,
@@ -1210,8 +1224,12 @@ static int esw_vport_ingress_config(struct mlx5_eswitch 
*esw,
 static int esw_vport_egress_config(struct mlx5_eswitch *esw,
   struct mlx5_vport *vport)
 {
+   struct mlx5_fc *counter = vport->egress.drop_counter;
+   struct mlx5_flow_destination drop_ctr_dst = {0};
+   struct mlx5_flow_destination *dst = NULL;
struct mlx5_flow_act flow_act = {0};
struct mlx5_flow_spec *spec;
+   int dest_num = 0;
int err = 0;
 
esw_vport_cleanup_egress_rules(esw, vport);
@@ -1262,9 +1280,18 @@ static int esw_vport_egress_config(struct mlx5_eswitch 
*esw,
/* Drop others rule (star rule) */
memset(spec, 0, sizeof(*spec));
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+
+   /* Attach egress drop flow counter */
+   if (counter) {
+   flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+   drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+   drop_ctr_dst.counter = counter;
+   dst = &drop_ctr_dst;
+   dest_num++;
+   }
vport->egress.drop_rule =
mlx5_add_flow_rules(vport->egress.acl, spec,
-   &flow_act, NULL, 0);
+   &flow_act, dst, dest_num);
if (IS_ERR(vport->egress.drop_rule)) {
err = PTR_ERR(vport->egress.drop_rule);
esw_warn(esw->dev,
@@ -1457,6 +1484,41 @@ static void esw_apply_vport_conf(struct mlx5_eswitch 
*esw,
}
 }
 
+static void esw_vport_create_drop_counters(struct mlx5_vport *vport)
+{
+   struct mlx5_core_dev *dev = vport->dev;
+
+   if (MLX5_CAP_ESW_INGRESS_ACL(dev, flow_counter)) {
+   vport->ingress.drop_counter = mlx5_fc_create(dev, false);
+   if (IS_ERR(vport->ingress.drop_counter)) {
+  

[net-next 01/10] net/mlx5: Add hairpin definitions to the FW API

2018-01-08 Thread Saeed Mahameed
From: Or Gerlitz 

Add hairpin definitions to the IFC file.

This includes the HCA ID, a few HCA hairpin capabilities, new
fields in RQ/SQ used later for the pairing, and the WQ hairpin
data size attribute.

Signed-off-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 include/linux/mlx5/mlx5_ifc.h | 43 +++
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index d44ec5f41d4a..78e36fc2609e 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -794,7 +794,10 @@ enum {
 };
 
 struct mlx5_ifc_cmd_hca_cap_bits {
-   u8 reserved_at_0[0x80];
+   u8 reserved_at_0[0x30];
+   u8 vhca_id[0x10];
+
+   u8 reserved_at_40[0x40];
 
u8 log_max_srq_sz[0x8];
u8 log_max_qp_sz[0x8];
@@ -1023,12 +1026,19 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 reserved_at_3b8[0x3];
u8 log_min_stride_sz_sq[0x5];
 
-   u8 reserved_at_3c0[0x1b];
+   u8 hairpin[0x1];
+   u8 reserved_at_3c1[0x2];
+   u8 log_max_hairpin_queues[0x5];
+   u8 reserved_at_3c8[0x3];
+   u8 log_max_hairpin_wq_data_sz[0x5];
+   u8 reserved_at_3d0[0xb];
u8 log_max_wq_sz[0x5];
 
u8 nic_vport_change_event[0x1];
u8 disable_local_lb[0x1];
-   u8 reserved_at_3e2[0x9];
+   u8 reserved_at_3e2[0x1];
+   u8 log_min_hairpin_wq_data_sz[0x5];
+   u8 reserved_at_3e8[0x3];
u8 log_max_vlan_list[0x5];
u8 reserved_at_3f0[0x3];
u8 log_max_current_mc_list[0x5];
@@ -1162,7 +1172,10 @@ struct mlx5_ifc_wq_bits {
u8 reserved_at_118[0x3];
u8 log_wq_sz[0x5];
 
-   u8 reserved_at_120[0x15];
+   u8 reserved_at_120[0xb];
+   u8 log_hairpin_data_sz[0x5];
+   u8 reserved_at_130[0x5];
+
u8 log_wqe_num_of_strides[0x3];
u8 two_byte_shift_en[0x1];
u8 reserved_at_139[0x4];
@@ -2482,7 +2495,8 @@ struct mlx5_ifc_sqc_bits {
u8 state[0x4];
u8 reg_umr[0x1];
u8 allow_swp[0x1];
-   u8 reserved_at_e[0x12];
+   u8 hairpin[0x1];
+   u8 reserved_at_f[0x11];
 
u8 reserved_at_20[0x8];
u8 user_index[0x18];
@@ -2490,7 +2504,13 @@ struct mlx5_ifc_sqc_bits {
u8 reserved_at_40[0x8];
u8 cqn[0x18];
 
-   u8 reserved_at_60[0x90];
+   u8 reserved_at_60[0x8];
+   u8 hairpin_peer_rq[0x18];
+
+   u8 reserved_at_80[0x10];
+   u8 hairpin_peer_vhca[0x10];
+
+   u8 reserved_at_a0[0x50];
 
u8 packet_pacing_rate_limit_index[0x10];
u8 tis_lst_sz[0x10];
@@ -2562,7 +2582,8 @@ struct mlx5_ifc_rqc_bits {
u8 state[0x4];
u8 reserved_at_c[0x1];
u8 flush_in_error_en[0x1];
-   u8 reserved_at_e[0x12];
+   u8 hairpin[0x1];
+   u8 reserved_at_f[0x11];
 
u8 reserved_at_20[0x8];
u8 user_index[0x18];
@@ -2576,7 +2597,13 @@ struct mlx5_ifc_rqc_bits {
u8 reserved_at_80[0x8];
u8 rmpn[0x18];
 
-   u8 reserved_at_a0[0xe0];
+   u8 reserved_at_a0[0x8];
+   u8 hairpin_peer_sq[0x18];
+
+   u8 reserved_at_c0[0x10];
+   u8 hairpin_peer_vhca[0x10];
+
+   u8 reserved_at_e0[0xa0];
 
struct mlx5_ifc_wq_bits wq;
 };
-- 
2.13.0



RE: [patch iproute2 v6 1/3] lib/libnetlink: Add a function rtnl_talk_msg

2018-01-08 Thread Chris Mi
> -Original Message-
> From: David Ahern [mailto:dsah...@gmail.com]
> Sent: Saturday, January 6, 2018 1:51 AM
> To: Chris Mi ; netdev@vger.kernel.org
> Cc: gerlitz...@gmail.com; step...@networkplumber.org;
> marcelo.leit...@gmail.com
> Subject: Re: [patch iproute2 v6 1/3] lib/libnetlink: Add a function
> rtnl_talk_msg
> 
> On 1/4/18 12:34 AM, Chris Mi wrote:
> > rtnl_talk can only send a single message to kernel. Add a new function
> > rtnl_talk_msg that can send multiple messages to kernel.
> >
> > Signed-off-by: Chris Mi 
> > ---
> >  include/libnetlink.h |  3 +++
> >  lib/libnetlink.c | 66 ++
> --
> >  2 files changed, 51 insertions(+), 18 deletions(-)
> >
> 
> I think you should add an argument to rtnl_talk_msg to return the number of
> messages processed. That can be used to refine which line failed. As batch
> size increases the current design puts the burden on the user to scan a lot of
> lines to find the one that fails:
> 
> tc -b tc.batch  -bs 50
> RTNETLINK answers: File exists
> We have an error talking to the kernel, -1 Command failed tc.batch:2-51
> 
> We should be able to tell them exactly which line failed.
Done.
> 
> Also, it would be better to call this rtnl_talk_iov, take an iov as an 
> argument
> and have a common rtnl_talk_msg for existing code and this new one.
> 
> As it stands you are having to add:
>struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
> 
> to tc functions when it really only needs to know about iov's.
Done.


Re: iscsi target regression due to "tcp: remove prequeue support" patch

2018-01-08 Thread Nicholas A. Bellinger
Hi MNC & Florian,

(Adding net-dev + DaveM CC')

Catching up on pre-holiday threads, thanks for the heads up.

Comments below.

On Wed, 2017-12-13 at 23:56 -0600, Mike Christie wrote:
> Hey Nick and Florian,
> 
> Starting in 4.14 iscsi logins will fail around 50% of the time.
> 
> I git bisected the issue down to this commit:
> 
> commit e7942d0633c47c791ece6afa038be9cf977226de
> Author: Florian Westphal 
> Date:   Sun Jul 30 03:57:18 2017 +0200
> 
> tcp: remove prequeue support
> 
> Nick, attached is the iscsi target log info when the login fails.
> 
> You can see at:
> 
> Dec 13 17:55:01 rhel73n1 kernel: Got Login Command, Flags 0x81, ITT:
> 0x, CmdSN: 0x, ExpStatSN: 0xf86dc69b, CID: 0, Length: 65
> 
> we have got a login command and we seem to then go into
> iscsit_do_rx_data -> sock_recvmsg
> 
> We seem to get stuck in there though, because we stay blocked until:
> 
> Dec 13 17:55:01 rhel73n1 kernel: Entering iscsi_target_sk_data_ready:
> conn: 88b35cbb3000
> Dec 13 17:55:01 rhel73n1 kernel: Got LOGIN_FLAGS_READ_ACTIVE=1, conn:
> 88b35cbb3000 
> 
> where initiator side timeout fires 15 seconds later and it disconnects
> the tcp connection, and we eventually break out of the recvmsg call:
> 
> Dec 13 17:55:16 rhel73n1 kernel: Entering iscsi_target_sk_state_change
> Dec 13 17:55:16 rhel73n1 kernel: __iscsi_target_sk_check_close:
> TCP_CLOSE_WAIT|TCP_CLOSE,returning FALSE
> 
> 
> 
> Dec 13 17:55:16 rhel73n1 kernel: rx_loop: 68, total_rx: 68, data: 68
> Dec 13 17:55:16 rhel73n1 kernel: iscsi_target_do_login_rx after
> rx_login_io, 88b35cbb3000, kworker/2:2:1829
> 

Ok, the third login request payload (65 + 3 padded to 68 bytes)
containing CHAP_N + CHAP_R keys remains blocked on sock_recvmsg(), until
TPG login_timeout subsequently fires after 15 seconds of inactivity to
terminate this login attempt.

> Is the iscsi target doing something incorrect in its use of
> sk_data_ready and sock_recvmsg or is the tcp patch at fault?

From the logs, sk_data_ready() -> iscsi_target_sk_data_ready() callbacks
appear to be firing as expected.

iscsi-target login does iscsit_rx_do_data() -> rx_data() ->
sock_recvmsg(..., MSG_WAITALL) from a system_wq kworker process context
after iscsi_target_sk_data_ready() callback queues up
iscsi_conn->login_work for execution, and sock_recvmsg() uses a single
struct kvec iovec for struct msg_hdr.

AFAICT, iscsi-target uses blocking kernel socket reads from process
context, similar to kernel_recvmsg(..., MSG_WAITALL) with DRBD.

Florian + DaveM, any idea why the removal of prequeue support is having
an effect here..?



[PATCH net-next 2/2] net: hns3: report the function type the same line with hns3_nic_get_stats64

2018-01-08 Thread Peng Li
The function type should be on the same line as the function
name, or it may cause a display error if a patch edits the
function. Here is an example:
https://www.spinics.net/lists/netdev/msg476141.html

Signed-off-by: Peng Li 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index b23107d..14c7625 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1126,8 +1126,8 @@ static int hns3_nic_set_features(struct net_device 
*netdev,
return 0;
 }
 
-static void
-hns3_nic_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 
*stats)
+static void hns3_nic_get_stats64(struct net_device *netdev,
+struct rtnl_link_stats64 *stats)
 {
struct hns3_nic_priv *priv = netdev_priv(netdev);
int queue_num = priv->ae_handle->kinfo.num_tqps;
-- 
1.9.1



[PATCH net-next 0/2] code improvements in HNS3 driver

2018-01-08 Thread Peng Li
This patchset addresses 2 comments from community review.
[patch 1/2] reverts "net: hns3: Add packet statistics of netdev"
reported by Jakub Kicinski and David Miller.
[patch 2/2] reports the function type the same line with
hns3_nic_get_stats64, reported by Andrew Lunn.

Peng Li (2):
  Revert "net: hns3: Add packet statistics of netdev"
  net: hns3: report the function type the same line with
hns3_nic_get_stats64

 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c|  4 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 80 +-
 2 files changed, 3 insertions(+), 81 deletions(-)

-- 
1.9.1



[PATCH net-next 1/2] Revert "net: hns3: Add packet statistics of netdev"

2018-01-08 Thread Peng Li
This reverts commit 8491000754796c838a0081c267f9dd54ad2ccba3.

Adding netdev statistics to ethtool -S is redundant.

Signed-off-by: Peng Li 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 80 +-
 1 file changed, 1 insertion(+), 79 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 1e8fac3..d3cb3ec 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -59,41 +59,6 @@ struct hns3_stats {
 
 #define HNS3_TQP_STATS_COUNT (HNS3_TXQ_STATS_COUNT + HNS3_RXQ_STATS_COUNT)
 
-/* netdev stats */
-#define HNS3_NETDEV_STAT(_string, _member) {   \
-   .stats_string = _string,\
-   .stats_offset = offsetof(struct rtnl_link_stats64, _member) \
-}
-
-static const struct hns3_stats hns3_netdev_stats[] = {
-   /* Rx per-queue statistics */
-   HNS3_NETDEV_STAT("rx_packets", rx_packets),
-   HNS3_NETDEV_STAT("tx_packets", tx_packets),
-   HNS3_NETDEV_STAT("rx_bytes", rx_bytes),
-   HNS3_NETDEV_STAT("tx_bytes", tx_bytes),
-   HNS3_NETDEV_STAT("rx_errors", rx_errors),
-   HNS3_NETDEV_STAT("tx_errors", tx_errors),
-   HNS3_NETDEV_STAT("rx_dropped", rx_dropped),
-   HNS3_NETDEV_STAT("tx_dropped", tx_dropped),
-   HNS3_NETDEV_STAT("multicast", multicast),
-   HNS3_NETDEV_STAT("collisions", collisions),
-   HNS3_NETDEV_STAT("rx_length_errors", rx_length_errors),
-   HNS3_NETDEV_STAT("rx_over_errors", rx_over_errors),
-   HNS3_NETDEV_STAT("rx_crc_errors", rx_crc_errors),
-   HNS3_NETDEV_STAT("rx_frame_errors", rx_frame_errors),
-   HNS3_NETDEV_STAT("rx_fifo_errors", rx_fifo_errors),
-   HNS3_NETDEV_STAT("rx_missed_errors", rx_missed_errors),
-   HNS3_NETDEV_STAT("tx_aborted_errors", tx_aborted_errors),
-   HNS3_NETDEV_STAT("tx_carrier_errors", tx_carrier_errors),
-   HNS3_NETDEV_STAT("tx_fifo_errors", tx_fifo_errors),
-   HNS3_NETDEV_STAT("tx_heartbeat_errors", tx_heartbeat_errors),
-   HNS3_NETDEV_STAT("tx_window_errors", tx_window_errors),
-   HNS3_NETDEV_STAT("rx_compressed", rx_compressed),
-   HNS3_NETDEV_STAT("tx_compressed", tx_compressed),
-};
-
-#define HNS3_NETDEV_STATS_COUNT ARRAY_SIZE(hns3_netdev_stats)
-
 #define HNS3_SELF_TEST_TPYE_NUM1
 #define HNS3_NIC_LB_TEST_PKT_NUM   1
 #define HNS3_NIC_LB_TEST_RING_ID   0
@@ -466,27 +431,6 @@ static u8 *hns3_get_strings_tqps(struct hnae3_handle 
*handle, u8 *data)
return data;
 }
 
-static u8 *hns3_netdev_stats_get_strings(u8 *data)
-{
-   int i;
-
-   /* get strings for netdev */
-   for (i = 0; i < HNS3_NETDEV_STATS_COUNT; i++) {
-   snprintf(data, ETH_GSTRING_LEN,
-hns3_netdev_stats[i].stats_string);
-   data += ETH_GSTRING_LEN;
-   }
-
-   snprintf(data, ETH_GSTRING_LEN, "netdev_rx_dropped");
-   data += ETH_GSTRING_LEN;
-   snprintf(data, ETH_GSTRING_LEN, "netdev_tx_dropped");
-   data += ETH_GSTRING_LEN;
-   snprintf(data, ETH_GSTRING_LEN, "netdev_tx_timeout");
-   data += ETH_GSTRING_LEN;
-
-   return data;
-}
-
 static void hns3_get_strings(struct net_device *netdev, u32 stringset, u8 
*data)
 {
struct hnae3_handle *h = hns3_get_handle(netdev);
@@ -498,7 +442,6 @@ static void hns3_get_strings(struct net_device *netdev, u32 
stringset, u8 *data)
 
switch (stringset) {
case ETH_SS_STATS:
-   buff = hns3_netdev_stats_get_strings(buff);
buff = hns3_get_strings_tqps(h, buff);
h->ae_algo->ops->get_strings(h, stringset, (u8 *)buff);
break;
@@ -537,27 +480,6 @@ static u64 *hns3_get_stats_tqps(struct hnae3_handle 
*handle, u64 *data)
return data;
 }
 
-static u64 *hns3_get_netdev_stats(struct net_device *netdev, u64 *data)
-{
-   struct hns3_nic_priv *priv = netdev_priv(netdev);
-   const struct rtnl_link_stats64 *net_stats;
-   struct rtnl_link_stats64 temp;
-   u8 *stat;
-   int i;
-
-   net_stats = dev_get_stats(netdev, &temp);
-   for (i = 0; i < HNS3_NETDEV_STATS_COUNT; i++) {
-   stat = (u8 *)net_stats + hns3_netdev_stats[i].stats_offset;
-   *data++ = *(u64 *)stat;
-   }
-
-   *data++ = netdev->rx_dropped.counter;
-   *data++ = netdev->tx_dropped.counter;
-   *data++ = priv->tx_timeout_count;
-
-   return data;
-}
-
 /* hns3_get_stats - get detail statistics.
  * @netdev: net device
  * @stats: statistics info.
@@ -574,7 +496,7 @@ static void hns3_get_stats(struct net_device *netdev,
return;
}
 
-   p = hns3_get_netdev_stats(netdev, p);
+   h->ae_algo->ops->update_stats(h, &netdev->stats);
 
/* get per-queue stats */
p = hns3_get_stats_tqps(h, p);
-- 
1.9.1



[PATCH net-next v3 06/10] net/mlx5e: Change Mellanox references in DIM code

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

Change all appropriate mlx5_am* and MLX5_AM* references to net_dim and
NET_DIM, respectively, in code that handles dynamic interrupt
moderation.  Also change all references from 'am' to 'dim' when used as
local variables and add generic profile references.

Signed-off-by: Andy Gospodarek 
Acked-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |   9 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c   |  14 +-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  40 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |   8 +-
 drivers/net/ethernet/mellanox/mlx5/core/net_dim.c  | 286 ++---
 drivers/net/ethernet/mellanox/mlx5/core/net_dim.h  |  63 ++---
 7 files changed, 225 insertions(+), 201 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 4ee06e7..4d1d298 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -238,8 +238,8 @@ struct mlx5e_params {
u16 num_channels;
u8  num_tc;
bool rx_cqe_compress_def;
-   struct mlx5e_cq_moder rx_cq_moderation;
-   struct mlx5e_cq_moder tx_cq_moderation;
+   struct net_dim_cq_moder rx_cq_moderation;
+   struct net_dim_cq_moder tx_cq_moderation;
bool lro_en;
u32 lro_wqe_sz;
u16 tx_max_inline;
@@ -249,7 +249,7 @@ struct mlx5e_params {
u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE];
bool vlan_strip_disable;
bool scatter_fcs_en;
-   bool rx_am_enabled;
+   bool rx_dim_enabled;
u32 lro_timeout;
u32 pflags;
struct bpf_prog *xdp_prog;
@@ -528,7 +528,7 @@ struct mlx5e_rq {
unsigned long  state;
intix;
 
-   struct mlx5e_rx_am am; /* Adaptive Moderation */
+   struct net_dim dim; /* Dynamic Interrupt Moderation */
 
/* XDP */
struct bpf_prog   *xdp_prog;
@@ -1079,4 +1079,5 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
struct mlx5e_params *params,
u16 max_channels);
 u8 mlx5e_params_calculate_tx_min_inline(struct mlx5_core_dev *mdev);
+void mlx5e_rx_dim_work(struct work_struct *work);
 #endif /* __MLX5_EN_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index b9b434b..f620325 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -32,17 +32,17 @@
 
 #include "en.h"
 
-void mlx5e_rx_am_work(struct work_struct *work)
+void mlx5e_rx_dim_work(struct work_struct *work)
 {
-   struct mlx5e_rx_am *am = container_of(work, struct mlx5e_rx_am,
- work);
-   struct mlx5e_rq *rq = container_of(am, struct mlx5e_rq, am);
-   struct mlx5e_cq_moder cur_profile = mlx5e_am_get_profile(am->mode,
-
am->profile_ix);
+   struct net_dim *dim = container_of(work, struct net_dim,
+  work);
+   struct mlx5e_rq *rq = container_of(dim, struct mlx5e_rq, dim);
+   struct net_dim_cq_moder cur_profile = net_dim_get_profile(dim->mode,
+ 
dim->profile_ix);
 
mlx5_core_modify_cq_moderation(rq->mdev, &rq->cq.mcq,
   cur_profile.usec, cur_profile.pkts);
 
-   am->state = MLX5E_AM_START_MEASURE;
+   dim->state = NET_DIM_START_MEASURE;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 8f05efa..51ae6df 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -480,7 +480,7 @@ int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv,
coal->rx_max_coalesced_frames = 
priv->channels.params.rx_cq_moderation.pkts;
coal->tx_coalesce_usecs   = 
priv->channels.params.tx_cq_moderation.usec;
coal->tx_max_coalesced_frames = 
priv->channels.params.tx_cq_moderation.pkts;
-   coal->use_adaptive_rx_coalesce = priv->channels.params.rx_am_enabled;
+   coal->use_adaptive_rx_coalesce = priv->channels.params.rx_dim_enabled;
 
return 0;
 }
@@ -534,7 +534,7 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
new_channels.params.tx_cq_moderation.pkts = 
coal->tx_max_coalesced_frames;
new_channels.params.rx_cq_moderation.usec = coal->rx_coalesce_usecs;
new_channels.params.rx_cq_moderation.pkts = 
coal->rx_max_coalesced_frames;
-   new_channels.params.rx_am_enabled = 
!!coal->use_adaptive_rx_coalesce;
+   new_channels.params.rx_dim_enabled 

[PATCH net-next v3 05/10] net/mlx5e: Move generic functions to new file

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

These functions were identified as ones that could be made generic and
used by multiple drivers.  Most of the contents of en_rx_am.c are moved
to net_dim.c.

Signed-off-by: Andy Gospodarek 
Acked-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c   |  48 
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h   | 102 ---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 320 -
 drivers/net/ethernet/mellanox/mlx5/core/net_dim.c  | 307 
 drivers/net/ethernet/mellanox/mlx5/core/net_dim.h  | 103 +++
 7 files changed, 461 insertions(+), 425 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/net_dim.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/net_dim.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 19b21b4..b46b6de2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -14,8 +14,8 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o 
fpga/conn.o fpga/sdk.o \
fpga/ipsec.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o 
\
-   en_tx.o en_rx.o en_rx_am.o en_txrx.o en_stats.o vxlan.o \
-   en_arfs.o en_fs_ethtool.o en_selftest.o
+   en_tx.o en_rx.o en_dim.o en_txrx.o en_stats.o vxlan.o \
+   en_arfs.o en_fs_ethtool.o en_selftest.o net_dim.o
 
 mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index e2e35ed..4ee06e7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -50,7 +50,7 @@
 #include "wq.h"
 #include "mlx5_core.h"
 #include "en_stats.h"
-#include "en_dim.h"
+#include "net_dim.h"
 
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
new file mode 100644
index 000..b9b434b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "en.h"
+
+void mlx5e_rx_am_work(struct work_struct *work)
+{
+   struct mlx5e_rx_am *am = container_of(work, struct mlx5e_rx_am,
+ work);
+   struct mlx5e_rq *rq = container_of(am, struct mlx5e_rq, am);
+   struct mlx5e_cq_moder cur_profile = mlx5e_am_get_profile(am->mode,
+
am->profile_ix);
+
+   mlx5_core_modify_cq_moderation(rq->mdev, &rq->cq.mcq,
+  cur_profile.usec, cur_profile.pkts);
+
+   am->state = MLX5E_AM_START_MEASURE;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
deleted file mode 100644
index a1497bab..000
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2013-2015, Mellanox Technologies

[PATCH net-next v3 03/10] net/mlx5e: Remove rq references in mlx5e_rx_am

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

This makes mlx5e_am_sample more generic so that it can be called easily
from a driver that does not store these values in the same data
structure.

Signed-off-by: Andy Gospodarek 
Acked-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h   |  6 --
 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 22 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |  5 -
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
index 2031a21..7d5499a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
@@ -66,8 +66,10 @@ struct mlx5e_rx_am { /* Adaptive Moderation */
u8  tired;
 };
 
-struct mlx5e_rq;
-void mlx5e_rx_am(struct mlx5e_rq *rq);
+void mlx5e_rx_am(struct mlx5e_rx_am *am,
+u16 event_ctr,
+u64 packets,
+u64 bytes);
 void mlx5e_rx_am_work(struct work_struct *work);
 struct mlx5e_cq_moder mlx5e_am_get_def_profile(u8 rx_cq_period_mode);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
index e401d9d..1630076 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
@@ -264,13 +264,15 @@ static bool mlx5e_am_decision(struct mlx5e_rx_am_stats 
*curr_stats,
return am->profile_ix != prev_ix;
 }
 
-static void mlx5e_am_sample(struct mlx5e_rq *rq,
+static void mlx5e_am_sample(u16 event_ctr,
+   u64 packets,
+   u64 bytes,
struct mlx5e_rx_am_sample *s)
 {
s->time  = ktime_get();
-   s->pkt_ctr   = rq->stats.packets;
-   s->byte_ctr  = rq->stats.bytes;
-   s->event_ctr = rq->cq.event_ctr;
+   s->pkt_ctr   = packets;
+   s->byte_ctr  = bytes;
+   s->event_ctr = event_ctr;
 }
 
 #define MLX5E_AM_NEVENTS 64
@@ -309,20 +311,22 @@ void mlx5e_rx_am_work(struct work_struct *work)
am->state = MLX5E_AM_START_MEASURE;
 }
 
-void mlx5e_rx_am(struct mlx5e_rq *rq)
+void mlx5e_rx_am(struct mlx5e_rx_am *am,
+u16 event_ctr,
+u64 packets,
+u64 bytes)
 {
-   struct mlx5e_rx_am *am = &rq->am;
struct mlx5e_rx_am_sample end_sample;
struct mlx5e_rx_am_stats curr_stats;
u16 nevents;
 
switch (am->state) {
case MLX5E_AM_MEASURE_IN_PROGRESS:
-   nevents = BIT_GAP(BITS_PER_TYPE(u16), rq->cq.event_ctr,
+   nevents = BIT_GAP(BITS_PER_TYPE(u16), event_ctr,
  am->start_sample.event_ctr);
if (nevents < MLX5E_AM_NEVENTS)
break;
-   mlx5e_am_sample(rq, &end_sample);
+   mlx5e_am_sample(event_ctr, packets, bytes, &end_sample);
mlx5e_am_calc_stats(&am->start_sample, &end_sample,
&curr_stats);
if (mlx5e_am_decision(&curr_stats, am)) {
@@ -332,7 +336,7 @@ void mlx5e_rx_am(struct mlx5e_rq *rq)
}
/* fall through */
case MLX5E_AM_START_MEASURE:
-   mlx5e_am_sample(rq, &am->start_sample);
+   mlx5e_am_sample(event_ctr, packets, bytes, &am->start_sample);
am->state = MLX5E_AM_MEASURE_IN_PROGRESS;
break;
case MLX5E_AM_APPLY_NEW_PROFILE:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index ab92298..1849169 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -79,7 +79,10 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
mlx5e_cq_arm(&c->sq[i].cq);
 
if (MLX5E_TEST_BIT(c->rq.state, MLX5E_RQ_STATE_AM))
-   mlx5e_rx_am(&c->rq);
+   mlx5e_rx_am(&c->rq.am,
+   c->rq.cq.event_ctr,
+   c->rq.stats.packets,
+   c->rq.stats.bytes);
 
mlx5e_cq_arm(&c->rq.cq);
mlx5e_cq_arm(&c->icosq.cq);
-- 
2.7.4



[PATCH net-next v3 02/10] net/mlx5e: Move interrupt moderation forward declarations

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

Move these to the newly created file to prepare for moving these
functions to a library.

Signed-off-by: Andy Gospodarek 
Acked-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h | 5 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index df9cbb3..e2e35ed 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -833,10 +833,6 @@ void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix);
 void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix);
 void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi);
 
-void mlx5e_rx_am(struct mlx5e_rq *rq);
-void mlx5e_rx_am_work(struct work_struct *work);
-struct mlx5e_cq_moder mlx5e_am_get_def_profile(u8 rx_cq_period_mode);
-
 void mlx5e_update_stats(struct mlx5e_priv *priv, bool full);
 
 int mlx5e_create_flow_steering(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
index 9eeaa11..2031a21 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
@@ -66,4 +66,9 @@ struct mlx5e_rx_am { /* Adaptive Moderation */
u8  tired;
 };
 
+struct mlx5e_rq;
+void mlx5e_rx_am(struct mlx5e_rq *rq);
+void mlx5e_rx_am_work(struct work_struct *work);
+struct mlx5e_cq_moder mlx5e_am_get_def_profile(u8 rx_cq_period_mode);
+
 #endif /* MLX5_AM_H */
-- 
2.7.4



[PATCH net-next v3 10/10] MAINTAINERS: add entry for Dynamic Interrupt Moderation

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

Signed-off-by: Andy Gospodarek 
Signed-off-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
 MAINTAINERS | 5 +
 1 file changed, 5 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 753799d..178239dc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4944,6 +4944,11 @@ S:   Maintained
 F: lib/dynamic_debug.c
 F: include/linux/dynamic_debug.h
 
+DYNAMIC INTERRUPT MODERATION
+M: Tal Gilboa 
+S: Maintained
+F: include/linux/net_dim.h
+
 DZ DECSTATION DZ11 SERIAL DRIVER
 M: "Maciej W. Rozycki" 
 S: Maintained
-- 
2.7.4



[PATCH net-next v3 08/10] net/dim: use struct net_dim_sample as arg to net_dim

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

Simplify the arguments to net_dim() by packing them into a struct
net_dim_sample before calling the function.

Signed-off-by: Andy Gospodarek 
Suggested-by: Tal Gilboa 
Acked-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 13 -
 include/linux/net_dim.h   | 10 +++---
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index a1c94fd..f292bb3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -78,11 +78,14 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
for (i = 0; i < c->num_tc; i++)
mlx5e_cq_arm(&c->sq[i].cq);
 
-   if (MLX5E_TEST_BIT(c->rq.state, MLX5E_RQ_STATE_AM))
-   net_dim(&c->rq.dim,
-   c->rq.cq.event_ctr,
-   c->rq.stats.packets,
-   c->rq.stats.bytes);
+   if (MLX5E_TEST_BIT(c->rq.state, MLX5E_RQ_STATE_AM)) {
+   struct net_dim_sample dim_sample;
+   net_dim_sample(c->rq.cq.event_ctr,
+  c->rq.stats.packets,
+  c->rq.stats.bytes,
+  &dim_sample);
+   net_dim(&c->rq.dim, dim_sample);
+   }
 
mlx5e_cq_arm(&c->rq.cq);
mlx5e_cq_arm(&c->icosq.cq);
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index 741510f..1c7e450 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -342,21 +342,18 @@ static inline void net_dim_calc_stats(struct 
net_dim_sample *start,
 }
 
 static inline void net_dim(struct net_dim *dim,
-  u16 event_ctr,
-  u64 packets,
-  u64 bytes)
+  struct net_dim_sample end_sample)
 {
-   struct net_dim_sample end_sample;
struct net_dim_stats curr_stats;
u16 nevents;
 
switch (dim->state) {
case NET_DIM_MEASURE_IN_PROGRESS:
-   nevents = BIT_GAP(BITS_PER_TYPE(u16), event_ctr,
+   nevents = BIT_GAP(BITS_PER_TYPE(u16),
+ end_sample.event_ctr,
  dim->start_sample.event_ctr);
if (nevents < NET_DIM_NEVENTS)
break;
-   net_dim_sample(event_ctr, packets, bytes, &end_sample);
net_dim_calc_stats(&dim->start_sample, &end_sample,
   &curr_stats);
if (net_dim_decision(&curr_stats, dim)) {
@@ -366,7 +363,6 @@ static inline void net_dim(struct net_dim *dim,
}
/* fall through */
case NET_DIM_START_MEASURE:
-   net_dim_sample(event_ctr, packets, bytes, &dim->start_sample);
dim->state = NET_DIM_MEASURE_IN_PROGRESS;
break;
case NET_DIM_APPLY_NEW_PROFILE:
-- 
2.7.4



[PATCH net-next v3 01/10] net/mlx5e: Move interrupt moderation structs to new file

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

Create new header file to prepare to move code that handles irq
moderation to a library that lives in a header file.

Signed-off-by: Andy Gospodarek 
Acked-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h | 33 +---
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h | 69 
 2 files changed, 70 insertions(+), 32 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 5299310..df9cbb3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -50,6 +50,7 @@
 #include "wq.h"
 #include "mlx5_core.h"
 #include "en_stats.h"
+#include "en_dim.h"
 
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
@@ -227,12 +228,6 @@ enum mlx5e_priv_flag {
 #define MLX5E_MAX_BW_ALLOC 100 /* Max percentage of BW allocation */
 #endif
 
-struct mlx5e_cq_moder {
-   u16 usec;
-   u16 pkts;
-   u8 cq_period_mode;
-};
-
 struct mlx5e_params {
u8  log_sq_size;
u8  rq_wq_type;
@@ -473,32 +468,6 @@ struct mlx5e_mpw_info {
u16 skbs_frags[MLX5_MPWRQ_PAGES_PER_WQE];
 };
 
-struct mlx5e_rx_am_stats {
-   int ppms; /* packets per msec */
-   int bpms; /* bytes per msec */
-   int epms; /* events per msec */
-};
-
-struct mlx5e_rx_am_sample {
-   ktime_t time;
-   u32 pkt_ctr;
-   u32 byte_ctr;
-   u16 event_ctr;
-};
-
-struct mlx5e_rx_am { /* Adaptive Moderation */
-   u8  state;
-   struct mlx5e_rx_am_statsprev_stats;
-   struct mlx5e_rx_am_sample   start_sample;
-   struct work_struct  work;
-   u8  profile_ix;
-   u8  mode;
-   u8  tune_state;
-   u8  steps_right;
-   u8  steps_left;
-   u8  tired;
-};
-
 /* a single cache unit is capable to serve one napi call (for non-striding rq)
  * or a MPWQE (for striding rq).
  */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
new file mode 100644
index 000..9eeaa11
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ * Copyright (c) 2017-2018, Broadcom Limited
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_AM_H
+#define MLX5_AM_H
+
+struct mlx5e_cq_moder {
+   u16 usec;
+   u16 pkts;
+   u8 cq_period_mode;
+};
+
+struct mlx5e_rx_am_sample {
+   ktime_t time;
+   u32 pkt_ctr;
+   u32 byte_ctr;
+   u16 event_ctr;
+};
+
+struct mlx5e_rx_am_stats {
+   int ppms; /* packets per msec */
+   int bpms; /* bytes per msec */
+   int epms; /* events per msec */
+};
+
+struct mlx5e_rx_am { /* Adaptive Moderation */
+   u8  state;
+   struct mlx5e_rx_am_statsprev_stats;
+   struct mlx5e_rx_am_sample   start_sample;
+   struct work_struct  work;
+   u8  profile_ix;
+   u8  mode;
+   u8  tune_state;
+   u

[PATCH net-next v3 07/10] net/mlx5e: Move dynamic interrupt coalescing code to include/linux

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

This move allows drivers to add private structure elements to track the
number of packets, bytes, and interrupt events per ring.  A driver
also defines a workqueue handler to act on this collected data once per
poll and modify the coalescing parameters per ring.

Signed-off-by: Andy Gospodarek 
Acked-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c  |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/net_dim.c | 307 --
 drivers/net/ethernet/mellanox/mlx5/core/net_dim.h | 108 ---
 include/linux/net_dim.h   | 377 ++
 6 files changed, 380 insertions(+), 417 deletions(-)
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/net_dim.c
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/net_dim.h
 create mode 100644 include/linux/net_dim.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index b46b6de2..c805769 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -15,7 +15,7 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o 
fpga/conn.o fpga/sdk.o \
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o 
\
en_tx.o en_rx.o en_dim.o en_txrx.o en_stats.o vxlan.o \
-   en_arfs.o en_fs_ethtool.o en_selftest.o net_dim.o
+   en_arfs.o en_fs_ethtool.o en_selftest.o
 
 mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 4d1d298..29b9675 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -47,10 +47,10 @@
 #include 
 #include 
 #include 
+#include 
 #include "wq.h"
 #include "mlx5_core.h"
 #include "en_stats.h"
-#include "net_dim.h"
 
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index f620325..2b89951 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -30,6 +30,7 @@
  * SOFTWARE.
  */
 
+#include 
 #include "en.h"
 
 void mlx5e_rx_dim_work(struct work_struct *work)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/net_dim.c 
b/drivers/net/ethernet/mellanox/mlx5/core/net_dim.c
deleted file mode 100644
index decb370..000
--- a/drivers/net/ethernet/mellanox/mlx5/core/net_dim.c
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
- * Copyright (c) 2017-2018, Broadcom Limited. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- *  - Redistributions of source code must retain the above
- *copyright notice, this list of conditions and the following
- *disclaimer.
- *
- *  - Redistributions in binary form must reproduce the above
- *copyright notice, this list of conditions and the following
- *disclaimer in the documentation and/or other materials
- *provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "en.h"
-
-#define NET_DIM_PARAMS_NUM_PROFILES 5
-/* Adaptive moderation profiles */
-#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256
-#define NET_DIM_DEF_PROFILE_CQE 1
-#define NET_DIM_DEF_PROFILE_EQE 1
-
-/* All profiles sizes must be NET_PARAMS_DIM_NUM_PROFILES */
-#define NET_DIM_EQE_PROFILES { \
-   {1,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-   {8,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-   {64,  NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-   {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-   {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-}
-
-#define NET_DIM_CQE_PROFILES

[PATCH net-next v3 09/10] bnxt_en: add support for software dynamic interrupt moderation

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

This implements the changes needed for the bnxt_en driver to add support
for dynamic interrupt moderation per ring.

This does add additional counters in the receive path, but testing shows
that any additional instructions are offset by throughput gain when the
default configuration is for low latency.

Signed-off-by: Andy Gospodarek 
Cc: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/Makefile   |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 50 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 34 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c | 33 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 12 ++
 5 files changed, 119 insertions(+), 12 deletions(-)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c

diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile 
b/drivers/net/ethernet/broadcom/bnxt/Makefile
index 59c8ec9..7c560d5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/Makefile
+++ b/drivers/net/ethernet/broadcom/bnxt/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_BNXT) += bnxt_en.o
 
-bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o 
bnxt_xdp.o bnxt_vfr.o bnxt_devlink.o
+bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o 
bnxt_xdp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o
 bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 89c3c87..cf6ebf1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1645,6 +1645,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi 
*bnapi, u32 *raw_cons,
rxr->rx_next_cons = NEXT_RX(cons);
 
 next_rx_no_prod:
+   cpr->rx_packets += 1;
+   cpr->rx_bytes += len;
*raw_cons = tmp_raw_cons;
 
return rc;
@@ -1802,6 +1804,7 @@ static irqreturn_t bnxt_msix(int irq, void *dev_instance)
struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
u32 cons = RING_CMP(cpr->cp_raw_cons);
 
+   cpr->event_ctr++;
prefetch(&cpr->cp_desc_ring[CP_RING(cons)][CP_IDX(cons)]);
napi_schedule(&bnapi->napi);
return IRQ_HANDLED;
@@ -2025,6 +2028,15 @@ static int bnxt_poll(struct napi_struct *napi, int 
budget)
break;
}
}
+   if (bp->flags & BNXT_FLAG_DIM) {
+   struct net_dim_sample dim_sample;
+
+   net_dim_sample(cpr->event_ctr,
+  cpr->rx_packets,
+  cpr->rx_bytes,
+  &dim_sample);
+   net_dim(&cpr->dim, dim_sample);
+   }
mmiowb();
return work_done;
 }
@@ -2617,6 +2629,8 @@ static void bnxt_init_cp_rings(struct bnxt *bp)
struct bnxt_ring_struct *ring = &cpr->cp_ring_struct;
 
ring->fw_ring_id = INVALID_HW_RING_ID;
+   cpr->rx_ring_coal.coal_ticks = bp->rx_coal.coal_ticks;
+   cpr->rx_ring_coal.coal_bufs = bp->rx_coal.coal_bufs;
}
 }
 
@@ -4593,6 +4607,36 @@ static void bnxt_hwrm_set_coal_params(struct bnxt_coal 
*hw_coal,
req->flags = cpu_to_le16(flags);
 }
 
+int bnxt_hwrm_set_ring_coal(struct bnxt *bp, struct bnxt_napi *bnapi)
+{
+   struct hwrm_ring_cmpl_ring_cfg_aggint_params_input req_rx = {0};
+   struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
+   struct bnxt_coal coal;
+   unsigned int grp_idx;
+
+   /* Tick values in micro seconds.
+* 1 coal_buf x bufs_per_record = 1 completion record.
+*/
+   memcpy(&coal, &bp->rx_coal, sizeof(struct bnxt_coal));
+
+   coal.coal_ticks = cpr->rx_ring_coal.coal_ticks;
+   coal.coal_bufs = cpr->rx_ring_coal.coal_bufs;
+
+   if (!bnapi->rx_ring)
+   return -ENODEV;
+
+   bnxt_hwrm_cmd_hdr_init(bp, &req_rx,
+  HWRM_RING_CMPL_RING_CFG_AGGINT_PARAMS, -1, -1);
+
+   bnxt_hwrm_set_coal_params(&coal, &req_rx);
+
+   grp_idx = bnapi->index;
+   req_rx.ring_id = cpu_to_le16(bp->grp_info[grp_idx].cp_fw_ring_id);
+
+   return hwrm_send_message(bp, &req_rx, sizeof(req_rx),
+HWRM_CMD_TIMEOUT);
+}
+
 int bnxt_hwrm_set_coal(struct bnxt *bp)
 {
int i, rc = 0;
@@ -5715,7 +5759,13 @@ static void bnxt_enable_napi(struct bnxt *bp)
int i;
 
for (i = 0; i < bp->cp_nr_rings; i++) {
+   struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring;
bp->bnapi[i]->in_reset = false;
+
+   if (bp->bnapi[i]->rx_ring) {
+   INIT_WORK(&cpr->dim.work, bnxt_dim_work);
+   cpr->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+   }
napi_enable(&bp->bnapi[i]->napi);
}
 }
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt

[PATCH net-next v3 04/10] net/mlx5e: Move AM logic enums

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

More movement to help make this code more generic.

Signed-off-by: Andy Gospodarek 
Acked-by: Tal Gilboa 
Acked-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.h   | 26 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 25 -
 2 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
index 7d5499a..a1497bab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.h
@@ -66,6 +66,32 @@ struct mlx5e_rx_am { /* Adaptive Moderation */
u8  tired;
 };
 
+/* Adaptive moderation logic */
+enum {
+   MLX5E_AM_START_MEASURE,
+   MLX5E_AM_MEASURE_IN_PROGRESS,
+   MLX5E_AM_APPLY_NEW_PROFILE,
+};
+
+enum {
+   MLX5E_AM_PARKING_ON_TOP,
+   MLX5E_AM_PARKING_TIRED,
+   MLX5E_AM_GOING_RIGHT,
+   MLX5E_AM_GOING_LEFT,
+};
+
+enum {
+   MLX5E_AM_STATS_WORSE,
+   MLX5E_AM_STATS_SAME,
+   MLX5E_AM_STATS_BETTER,
+};
+
+enum {
+   MLX5E_AM_STEPPED,
+   MLX5E_AM_TOO_TIRED,
+   MLX5E_AM_ON_EDGE,
+};
+
 void mlx5e_rx_am(struct mlx5e_rx_am *am,
 u16 event_ctr,
 u64 packets,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
index 1630076..337dd60 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
@@ -82,31 +82,6 @@ struct mlx5e_cq_moder mlx5e_am_get_def_profile(u8 
rx_cq_period_mode)
return mlx5e_am_get_profile(rx_cq_period_mode, default_profile_ix);
 }
 
-/* Adaptive moderation logic */
-enum {
-   MLX5E_AM_START_MEASURE,
-   MLX5E_AM_MEASURE_IN_PROGRESS,
-   MLX5E_AM_APPLY_NEW_PROFILE,
-};
-
-enum {
-   MLX5E_AM_PARKING_ON_TOP,
-   MLX5E_AM_PARKING_TIRED,
-   MLX5E_AM_GOING_RIGHT,
-   MLX5E_AM_GOING_LEFT,
-};
-
-enum {
-   MLX5E_AM_STATS_WORSE,
-   MLX5E_AM_STATS_SAME,
-   MLX5E_AM_STATS_BETTER,
-};
-
-enum {
-   MLX5E_AM_STEPPED,
-   MLX5E_AM_TOO_TIRED,
-   MLX5E_AM_ON_EDGE,
-};
 
 static bool mlx5e_am_on_top(struct mlx5e_rx_am *am)
 {
-- 
2.7.4



[PATCH net-next v3 00/10] net: create dynamic software irq moderation library

2018-01-08 Thread Andy Gospodarek
From: Andy Gospodarek 

This converts the dynamic interrupt moderation library from the mlx5e
driver into a library so it can be used by any driver.  The penultimate
patch in this set adds support for this new dynamic interrupt moderation
library in the bnxt_en driver and the last patch creates an entry in the
MAINTAINERS file for this library.

The main purpose of this code is to allow an administrator to make sure
that default coalesce settings are optimized for low latency, but
quickly adapt to handle high throughput/bulk traffic by altering how
much time passes before popping an interrupt.

For any new driver the following changes would be needed to use this
library:

- add elements in ring struct to track items needed by this library
- create function that can be called to actually set coalesce settings
  for the driver

Credit to Rob Rice and Lee Reed for doing some of the initial proof of
concept and testing for this patch and Tal Gilboa and Or Gerlitz for
their comments, etc on this set.

v3: bnxt_en fix from Michael Chan, comment suggestion from Vasundhara
Volam, and small mlx5e header file fix from Tal Gilboa.

v2: Spelling fixes from Stephen Hemminger, bnxt_en suggestions from
Michael Chan, spelling and formatting fixes from Or Gerlitz, and
spelling and mlx5e changes suggested by Tal Gilboa.

Andy Gospodarek (10):
  net/mlx5e: Move interrupt moderation structs to new file
  net/mlx5e: Move interrupt moderation forward declarations
  net/mlx5e: Remove rq references in mlx5e_rx_am
  net/mlx5e: Move AM logic enums
  net/mlx5e: Move generic functions to new file
  net/mlx5e: Change Mellanox references in DIM code
  net/mlx5e: Move dynamic interrupt coalescing code to include/linux
  net/dim: use struct net_dim_sample as arg to net_dim
  bnxt_en: add support for software dynamic interrupt moderation
  MAINTAINERS: add entry for Dynamic Interrupt Moderation

 MAINTAINERS|   5 +
 drivers/net/ethernet/broadcom/bnxt/Makefile|   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c  |  50 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h  |  34 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c  |  33 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c  |  12 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  46 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c   |  49 +++
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  40 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 341 ---
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |  10 +-
 include/linux/net_dim.h| 373 +
 14 files changed, 593 insertions(+), 410 deletions(-)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
 create mode 100644 include/linux/net_dim.h

-- 
2.7.4



Re: WARNING: held lock freed!

2018-01-08 Thread Xin Long
On Mon, Jan 8, 2018 at 8:09 PM, Marcelo Ricardo Leitner
 wrote:
> On Mon, Jan 08, 2018 at 06:01:14PM +0800, Xin Long wrote:
>> On Mon, Jan 8, 2018 at 6:58 AM, syzbot
>>  wrote:
>> > Hello,
>> >
>> > syzkaller hit the following crash on
>> > 3219e264b984ec0a13923aa66385819c2e98d582
>> > git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master
>> > compiler: gcc (GCC) 7.1.1 20170620
>> > .config is attached
>> > Raw console output is attached.
>> > C reproducer is attached
>> > syzkaller reproducer is attached. See https://goo.gl/kgGztJ
>> > for information about syzkaller reproducers
>> >
>> >
>> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
>> > Reported-by: syzbot+ac6ea7baa4432811e...@syzkaller.appspotmail.com
>> > It will help syzbot understand when the bug is fixed. See footer for
>> > details.
>> > If you forward the report, please keep this part and the footer.
>> >
>> >
>> > =
>> > WARNING: held lock freed!
>> > 4.15.0-rc6+ #250 Not tainted
>> > -
>> > syzkaller065230/3505 is freeing memory 7fcba654-499fef26,
>> > with a lock still held there!
>> >  (sk_lock-AF_INET6){+.+.}, at: [] lock_sock
>> > include/net/sock.h:1461 [inline]
>> >  (sk_lock-AF_INET6){+.+.}, at: []
>> > sctp_wait_for_sndbuf+0x509/0x8d0 net/sctp/socket.c:8042
>> > 1 lock held by syzkaller065230/3505:
>> >  #0:  (sk_lock-AF_INET6){+.+.}, at: [] lock_sock
>> > include/net/sock.h:1461 [inline]
>> >  #0:  (sk_lock-AF_INET6){+.+.}, at: []
>> > sctp_wait_for_sndbuf+0x509/0x8d0 net/sctp/socket.c:8042
>> >
>> > stack backtrace:
>> > CPU: 0 PID: 3505 Comm: syzkaller065230 Not tainted 4.15.0-rc6+ #250
>> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
>> > Google 01/01/2011
>> > Call Trace:
>> >  __dump_stack lib/dump_stack.c:17 [inline]
>> >  dump_stack+0x194/0x257 lib/dump_stack.c:53
>> >  print_freed_lock_bug kernel/locking/lockdep.c:4379 [inline]
>> >  debug_check_no_locks_freed+0x32f/0x3c0 kernel/locking/lockdep.c:4412
>> >  kmem_cache_free+0x68/0x2a0 mm/slab.c:3743
>> >  sk_prot_free net/core/sock.c:1504 [inline]
>> >  __sk_destruct+0x622/0x910 net/core/sock.c:1585
>> >  sk_destruct+0x47/0x80 net/core/sock.c:1593
>> >  __sk_free+0x57/0x230 net/core/sock.c:1601
>> >  sk_free+0x2a/0x40 net/core/sock.c:1612
>> >  sock_put include/net/sock.h:1656 [inline]
>> >  sctp_association_destroy net/sctp/associola.c:424 [inline]
>> >  sctp_association_put+0x14c/0x2f0 net/sctp/associola.c:883
>> >  sctp_wait_for_sndbuf+0x673/0x8d0 net/sctp/socket.c:8053
>> >  sctp_sendmsg+0x277d/0x3360 net/sctp/socket.c:1974
>> >  inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763
>> >  sock_sendmsg_nosec net/socket.c:636 [inline]
>> >  sock_sendmsg+0xca/0x110 net/socket.c:646
>> >  SYSC_sendto+0x361/0x5c0 net/socket.c:1727
>> >  SyS_sendto+0x40/0x50 net/socket.c:1695
>> >  entry_SYSCALL_64_fastpath+0x23/0x9a
>> > RIP: 0033:0x445db9
>> > RSP: 002b:7f2467415d98 EFLAGS: 0212 ORIG_RAX: 002c
>> > RAX: ffda RBX: 006dbc84 RCX: 00445db9
>> > RDX: 0001 RSI: 2010bf14 RDI: 0005
>> > RBP:  R08: 204d9000 R09: 001c
>> > R10:  R11: 0212 R12: 006dbc80
>> > R13: 209a9000 R14: 0100 R15: 0001
>> > ==
>> > BUG: KASAN: use-after-free in debug_spin_lock_before
>> > kernel/locking/spinlock_debug.c:83 [inline]
>> > BUG: KASAN: use-after-free in do_raw_spin_lock+0x1e0/0x220
>> > kernel/locking/spinlock_debug.c:112
>> > Read of size 4 at addr 8801c17ab08c by task syzkaller065230/3505
>>
>> It seems this commit is buggy:
>> commit cea0cc80a6777beb6eb643d4ad53690e1ad1d4ff
>> Author: Xin Long 
>> Date:   Wed Nov 15 16:57:26 2017 +0800
>>
>> sctp: use the right sk after waking up from wait_buf sleep
>>
>> I already had a patch to drop it and fix the old issue in a better way.
>
> Maybe you can quickly share it, so we have an idea what you are
> talking about?
Sure, in sctp_wait_for_sndbuf().

lock_sock(sk);
-   if (sk != asoc->base.sk) {
-   release_sock(sk);
-   sk = asoc->base.sk;
-   lock_sock(sk);
-   }
+   if (sk != asoc->base.sk)
+   goto do_error;

I had this patch when doing cleanup in sctp_sendmsg, it will also
make that cleanup easier.

Some comments for it:

After commit cea0cc80a677 ("sctp: use the right sk after waking up from
wait_buf sleep"), it may change to lock another sk if the asoc has been
peeled off in sctp_wait_for_sndbuf.

However, the asoc's new sk could already have been closed elsewhere, because
we are in the sendmsg context of the old sk, which cannot prevent the new sk
from being closed.
   

[PATCH v2 bpf] bpf: introduce BPF_JIT_ALWAYS_ON config

2018-01-08 Thread Alexei Starovoitov
The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715.

A quote from the Google Project Zero blog:
"At this point, it would normally be necessary to locate gadgets in
the host kernel code that can be used to actually leak data by reading
from an attacker-controlled location, shifting and masking the result
appropriately and then using the result of that as offset to an
attacker-controlled address for a load. But piecing gadgets together
and figuring out which ones work in a speculation context seems annoying.
So instead, we decided to use the eBPF interpreter, which is built into
the host kernel - while there is no legitimate way to invoke it from inside
a VM, the presence of the code in the host kernel's text section is sufficient
to make it usable for the attack, just like with ordinary ROP gadgets."

To make the attacker's job harder, introduce the BPF_JIT_ALWAYS_ON config
option that removes the interpreter from the kernel in favor of JIT-only mode.
So far eBPF JIT is supported by:
x64, arm64, arm32, sparc64, s390, powerpc64, mips64

The start of JITed program is randomized and code page is marked as read-only.
In addition "constant blinding" can be turned on with net.core.bpf_jit_harden

v1->v2:
- fix init order, test_bpf and cBPF (Daniel's feedback)
- fix offloaded bpf (Jakub's feedback)
- add 'return 0' dummy in case something can invoke prog->bpf_func
- retarget bpf tree. For bpf-next the patch would need one extra hunk.
  It will be sent when the trees are merged back to net-next

Considered doing:
  int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT;
but it seems better to land the patch as-is and in bpf-next remove
bpf_jit_enable global variable from all JITs, consolidate in one place
and remove this jit_init() function.

Signed-off-by: Alexei Starovoitov 
---
 init/Kconfig   |  7 +++
 kernel/bpf/core.c  | 19 +++
 lib/test_bpf.c | 11 +++
 net/core/filter.c  |  6 ++
 net/core/sysctl_net_core.c |  6 ++
 net/socket.c   |  9 +
 6 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 2934249fba46..5e2a4a391ba9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1392,6 +1392,13 @@ config BPF_SYSCALL
  Enable the bpf() system call that allows to manipulate eBPF
  programs and maps via file descriptors.
 
+config BPF_JIT_ALWAYS_ON
+   bool "Permanently enable BPF JIT and remove BPF interpreter"
+   depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
+   help
+ Enables BPF JIT and removes BPF interpreter to avoid
+ speculative execution of BPF instructions by the interpreter
+
 config USERFAULTFD
bool "Enable userfaultfd() system call"
select ANON_INODES
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 86b50aa26ee8..b529982c3126 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 
r4, u64 r5)
 }
 EXPORT_SYMBOL_GPL(__bpf_call_base);
 
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
 /**
  * __bpf_prog_run - run eBPF program on a given context
  * @ctx: is the data we are operating on
@@ -1317,6 +1318,8 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
 EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 };
 
+#endif
+
 bool bpf_prog_array_compatible(struct bpf_array *array,
   const struct bpf_prog *fp)
 {
@@ -1354,6 +1357,12 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
return 0;
 }
 
+static unsigned int __bpf_prog_ret0(const void *ctx,
+   const struct bpf_insn *insn)
+{
+   return 0;
+}
+
 /**
  * bpf_prog_select_runtime - select exec runtime for BPF program
  * @fp: bpf_prog populated with internal BPF program
@@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
  */
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
 
fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+#else
+   fp->bpf_func = __bpf_prog_ret0;
+#endif
 
/* eBPF JITs can rewrite the program in case constant
 * blinding is active. However, in case of error during
@@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog 
*fp, int *err)
 */
if (!bpf_prog_is_dev_bound(fp->aux)) {
fp = bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+   if (!fp->jited) {
+   *err = -ENOTSUPP;
+   return fp;
+   }
+#endif
} else {
*err = bpf_prog_offload_compile(fp);
if (*err)
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 9e9748089270..f369889e521d 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -6250,9 +6250,8 @@ stati

Re: [PATCH 16/18] net: mpls: prevent bounds-check bypass via speculative execution

2018-01-08 Thread Linus Torvalds
On Mon, Jan 8, 2018 at 8:13 PM, Linus Torvalds
 wrote:
>
> # carry will be clear if idx >= max
> cmpq %idx,%max

Bah. Other way around.

cmpq %max,%idx

I'm a moron.

> # mask will be clear if carry was clear, ~0 otherwise
> sbbq %mask,%mask
>
> to generate mask directly. I might have screwed that up. Worth perhaps trying?

More importantly, worth _testing_ and fixing my hand-waving "asm like
this" crap.

But I do think that simple two-instruction cmpq/sbbq sequence could
get it right in just two trivial ALU instructions.

  Linus


Re: [PATCH 16/18] net: mpls: prevent bounds-check bypass via speculative execution

2018-01-08 Thread Linus Torvalds
On Mon, Jan 8, 2018 at 7:42 PM, Dan Williams  wrote:
>
> originally from Linus and tweaked by Alexei and I:

Sadly, that tweak - while clever - is wrong.

> unsigned long _mask = ~(long)(_m - 1 - _i) >> BITS_PER_LONG - 1;\

Why?

Because "(long)(_m-1-_i)" is not negative just because "i >= m". It
can still be positive.

Think "m = 100", "i=bignum". The subtraction will overflow and become
positive again, the shift will shift to zero, and then the mask will
become ~0.

Now, you can fix it, but you need to be a tiny bit more clever.  In
particular, just make sure that you retain the high bit of "_i",
basically making the rule be that a negative index is not ever valid.

And then instead of "(_m - 1 - _i)", you use "(_i | (_m - 1 - _i))".
Now the sign bit is set if _i had it set, _or_ if the subtraction
turned negative, and you don't have to worry about the overflow
situation.

But it does require that extra step to be trustworthy. Still purely
cheap arithmetic operations, although there is possibly some
additional register pressure there.

Somebody might be able to come up with something even more minimal (or
find a fault in my fix of your tweak).

Obviously, with architecture-specific code, you may well be able to do
better, using the carry flag of the subtraction.

For example, on x86, I think you could do it with just two instructions:

# carry will be clear if idx >= max
cmpq %idx,%max

# mask will be clear if carry was clear, ~0 otherwise
sbbq %mask,%mask

to generate mask directly. I might have screwed that up. Worth perhaps trying?

   Linus


Re: [PATCH 16/18] net: mpls: prevent bounds-check bypass via speculative execution

2018-01-08 Thread Dan Williams
On Mon, Jan 8, 2018 at 7:11 PM, Eric W. Biederman  wrote:
> Dan Williams  writes:
>
>> Static analysis reports that 'index' may be a user controlled value that
>> is used as a data dependency reading 'rt' from the 'platform_label'
>> array.  In order to avoid potential leaks of kernel memory values, block
>> speculative execution of the instruction stream that could issue further
>> reads based on an invalid 'rt' value.
>
>
> In detail.
> a) This code is fast path packet forwarding code.  Introducing an
>unconditional pipeline stall is not ok.
>
>AKA either there is no speculation and so this is invulnerable
>or there is speculation and you are creating an unconditional
>pipeline stall here.
>
>My back of the napkin calculations say that a pipeline stall
>is about 20 cycles.  Which is about the same length of time
>as a modern cache miss.
>
>On a good day this code will perform with 0 cache misses. On a less
>good day 1 cache miss.  Which means you are quite possibly doubling
>the runtime of mpls_forward.
>
> b) The array is dynamically allocated which should provide some
>protection, as it will be more difficult to predict the address
>of the array which is needed to craft a malicious userspace value.
>
> c) The code can be trivially modified to say:
>
> static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned 
> index)
> {
> struct mpls_route *rt = NULL;
>
> if (index < net->mpls.platform_labels) {
> struct mpls_route __rcu **platform_label =
> rcu_dereference(net->mpls.platform_label);
> rt = rcu_dereference(platform_label[index & ((1 << 20) - 1)]);
> }
> return rt;
> }
>
> AKA a static mask will ensure that there is not a primitive that can be
> used to access all of memory.  That is max a 1 cycle slowdown in the
> code, which is a much better trade off.
>
> d) If we care more it is straightforward to modify
>resize_platform_label_table() to ensure that the size of the array
>is always a power of 2.
>
> e) The fact that a pointer is returned from the array and it is treated
>like a pointer would seem to provide a defense against the
>exfiltration technique of using the value read as an index into
>a small array, that user space code can probe aliased cached
>lines of, to see which value was dereferenced.
>
>
> So to this patch in particular.
> Nacked-by: "Eric W. Biederman" 
>
> This code path will be difficult to exploit.  This change messes with
> performance.  There are ways to make this code path useless while
> preserving the performance of the code.
>

Thanks Eric, understood. The discussion over the weekend came to the
conclusion that using a mask will be the default approach. The
nospec_array_ptr() will be defined to something similar to the
following, originally from Linus and tweaked by Alexei and I:

#define __nospec_array_ptr(base, idx, sz)   \
({  \
union { typeof(&base[0]) _ptr; unsigned long _bit; } __u;   \
unsigned long _i = (idx);   \
unsigned long _m = (max);   \
unsigned long _mask = ~(long)(_m - 1 - _i) >> BITS_PER_LONG - 1;\
OPTIMIZER_HIDE_VAR(_mask);  \
__u._ptr = &base[_i & _mask];   \
__u._bit &= _mask;  \
__u._ptr;   \
})

Does that address your performance concerns?


Re: [v2] net: gianfar_ptp: move set_fipers() to spinlock protecting area

2018-01-08 Thread Richard Cochran
On Tue, Jan 09, 2018 at 11:02:33AM +0800, Yangbo Lu wrote:
> set_fipers() calling should be protected by spinlock in
> case that any interrupt breaks related registers setting
> and the function we expect. This patch is to move set_fipers()
> to spinlock protecting area in ptp_gianfar_adjtime().
> 
> Signed-off-by: Yangbo Lu 

Acked-by: Richard Cochran 


Re: [PATCH] net: gianfar_ptp: move set_fipers() to spinlock protecting area

2018-01-08 Thread Richard Cochran
On Mon, Jan 08, 2018 at 10:53:40AM -0200, Fabio Estevam wrote:
> On Mon, Jan 8, 2018 at 8:13 AM, Yangbo Lu  wrote:
> > set_fipers() calling should be protected by spinlock.
> > This patch is to move set_fipers() to spinlock protecting
> > area in ptp_gianfar_adjtime() function.
> 
> It would be nice to explain why.

Maybe this is important?

/* Caller must hold etsects->lock. */
static void set_fipers(struct etsects *etsects)
{
set_alarm(etsects);
gfar_write(&etsects->regs->tmr_fiper1, etsects->tmr_fiper1);
gfar_write(&etsects->regs->tmr_fiper2, etsects->tmr_fiper2);
}

Thanks,
Richard



Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev

2018-01-08 Thread lipeng (Y)



On 2018/1/9 11:06, David Miller wrote:

From: "lipeng (Y)" 
Date: Tue, 9 Jan 2018 10:48:04 +0800


So I think it is OK if you can revert [patch 12/20 ]("net: hns3: Add
packet statistics of netdev").

I think it is OK if you send the revert patch, which is what I
am asking for :-)

.

sure,  i will send the revert patch.
I have tested it in my local branch.

Thanks
Peng Li








RE: [PATCH] net: gianfar_ptp: move set_fipers() to spinlock protecting area

2018-01-08 Thread Y.b. Lu


> -Original Message-
> From: Fabio Estevam [mailto:feste...@gmail.com]
> Sent: 2018年1月8日 20:54
> To: Y.b. Lu 
> Cc: Claudiu Manoil ; Richard Cochran
> ; netdev@vger.kernel.org; linux-kernel
> 
> Subject: Re: [PATCH] net: gianfar_ptp: move set_fipers() to spinlock 
> protecting
> area
> 
> On Mon, Jan 8, 2018 at 8:13 AM, Yangbo Lu  wrote:
> > set_fipers() calling should be protected by spinlock.
> > This patch is to move set_fipers() to spinlock protecting area in
> > ptp_gianfar_adjtime() function.
> 
> > It would be nice to explain why.

[Y.b. Lu] Sent out v2 patch and explained in commit message :)
Thanks.

> 
> Thanks


[v2] net: gianfar_ptp: move set_fipers() to spinlock protecting area

2018-01-08 Thread Yangbo Lu
set_fipers() calling should be protected by spinlock in
case that any interrupt breaks related registers setting
and the function we expect. This patch is to move set_fipers()
to spinlock protecting area in ptp_gianfar_adjtime().

Signed-off-by: Yangbo Lu 
---
Changes for v2:
- explained why spinlock was needed in commit message.
---
 drivers/net/ethernet/freescale/gianfar_ptp.c |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/gianfar_ptp.c 
b/drivers/net/ethernet/freescale/gianfar_ptp.c
index 5441142..9f8d4f8 100644
--- a/drivers/net/ethernet/freescale/gianfar_ptp.c
+++ b/drivers/net/ethernet/freescale/gianfar_ptp.c
@@ -319,11 +319,10 @@ static int ptp_gianfar_adjtime(struct ptp_clock_info 
*ptp, s64 delta)
now = tmr_cnt_read(etsects);
now += delta;
tmr_cnt_write(etsects, now);
+   set_fipers(etsects);
 
spin_unlock_irqrestore(&etsects->lock, flags);
 
-   set_fipers(etsects);
-
return 0;
 }
 
-- 
1.7.1



[GIT] Networking

2018-01-08 Thread David Miller

Highlights:

1) Frag and UDP handling fixes in i40e driver, from Amritha Nambiar and
   Alexander Duyck.

2) Undo unintentional UAPI change in netfilter conntrack, from Florian
   Westphal.

3) Revert a change to how error codes are returned from
   dev_get_valid_name(), it broke some apps.

4) Cannot cache routes for ipv6 tunnels if the tunnel is ipv4/ipv6
   dual-stack.  From Eli Cooper.

5) Fix missed PMTU updates in geneve, from Xin Long.

6) Cure double free in macvlan, from Gao Feng.

7) Fix heap out-of-bounds write in rds_message_alloc_sgs(), from
   Mohamed Ghannam.

8) FEC bug fixes from Fugang Duan (mis-accounting of dev_id, missed
   deferral of probe when the regulator is not ready yet).

9) Missing DMA mapping error checks in 3c59x, from Neil Horman.

10) Turn off Broadcom tags for some b53 switches, from Florian
Fainelli.

11) Fix OOPS when get_target_net() is passed an SKB whose NETLINK_CB()
isn't initialized.  From Andrei Vagin.

12) Fix crashes in fib6_add(), from Wei Wang.

13) PMTU bug fixes in SCTP from Marcelo Ricardo Leitner.

Please pull, thanks a lot!

The following changes since commit 2758b3e3e630ba304fc4aca434d591e70e528298:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2017-12-28 
23:20:21 -0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 

for you to fetch changes up to 50f3d740d376f664f6accc7e86c9afd8f1c7e1e4:

  sh_eth: fix TXALCR1 offsets (2018-01-08 14:31:38 -0500)


Alexander Duyck (1):
  i40e/i40evf: Account for frags split over multiple descriptors in check 
linearize

Amritha Nambiar (1):
  i40e: Remove UDP support for big buffer

Andrei Vagin (1):
  rtnetlink: give a user socket to get_target_net()

Arjun Vynipadath (1):
  cxgb4: Fix FW flash errors

Benjamin Poirier (1):
  e1000e: Fix e1000_check_for_copper_link_ich8lan return value.

Christophe JAILLET (1):
  mdio-sun4i: Fix a memory leak

David S. Miller (10):
  Revert "net: core: dev_get_valid_name is now the same as 
dev_alloc_name_ns"
  Merge branch '1GbE' of git://git.kernel.org/.../jkirsher/net-queue
  Merge branch 'fec-clean-up-in-the-cases-of-probe-error'
  Merge branch 'ena-fixes'
  Merge branch '40GbE' of git://git.kernel.org/.../jkirsher/net-queue
  Merge tag 'mac80211-for-davem-2018-01-04' of 
git://git.kernel.org/.../jberg/mac80211
  Merge git://git.kernel.org/.../pablo/nf
  Merge tag 'linux-can-fixes-for-4.15-20180104' of 
git://git.kernel.org/.../mkl/linux-can
  Merge branch 'bnxt_en_fixes'
  Merge branch 'SCTP-PMTU-discovery-fixes'

Eduardo Otubo (1):
  xen-netfront: enable device after manual module load

Eli Cooper (1):
  ip6_tunnel: disable dst caching if tunnel is dual-stack

Felix Janda (1):
  uapi libc compat: add fallback for unsupported libcs

Florian Fainelli (1):
  net: dsa: b53: Turn off Broadcom tags for more switches

Florian Westphal (1):
  netfilter: uapi: correct UNTRACKED conntrack state bit number

Fugang Duan (3):
  net: fec: restore dev_id in the cases of probe error
  net: fec: defer probe if regulator is not ready
  net: fec: free/restore resource in related probe error pathes

Gao Feng (1):
  macvlan: Fix one possible double free

Gustavo A. R. Silva (1):
  phylink: mark expected switch fall-throughs in phylink_mii_ioctl

Hangbin Liu (1):
  netfilter: nf_tables: fix potential NULL-ptr deref in 
nf_tables_dump_obj_done()

Hao Chen (1):
  nl80211: Check for the required netlink attribute presence

Hauke Mehrtens (1):
  uapi/if_ether.h: prevent redefinition of struct ethhdr

Ido Schimmel (2):
  mlxsw: spectrum_router: Fix NULL pointer deref
  mlxsw: spectrum: Relax sanity checks during enslavement

Jacob Keller (1):
  i40e: don't remove netdev->dev_addr when syncing uc list

Jerome Brunet (1):
  net: stmmac: enable EEE in MII, GMII or RGMII only

Jiri Pirko (1):
  i40e: flower: Fix return value for unsupported offload

Johannes Berg (1):
  mac80211: mesh: drop frames appearing to be from us

Jon Maloy (1):
  tipc: fix problems with multipoint-to-point flow control

Luu An Phu (1):
  can: flex_can: Correct the checking for frame length in 
flexcan_start_xmit()

Marcelo Ricardo Leitner (3):
  sctp: fix error path in sctp_stream_init
  sctp: do not retransmit upon FragNeeded if PMTU discovery is disabled
  sctp: fix the handling of ICMP Frag Needed for too small MTUs

Martin Lederhilger (1):
  can: ems_usb: improve error reporting for error warning and error passive

Mohamed Ghannam (2):
  RDS: Heap OOB write in rds_message_alloc_sgs()
  RDS: null pointer dereference in rds_atomic_free_op

Neil Horman (1):
  3c59x: fix missing dma_mapping_error check and bad ring refill logic

Netanel Belgazal (2):
  net: ena: unmask MSI-X only after device initialization i

Re: [PATCH 16/18] net: mpls: prevent bounds-check bypass via speculative execution

2018-01-08 Thread Eric W. Biederman
Dan Williams  writes:

> Static analysis reports that 'index' may be a user controlled value that
> is used as a data dependency reading 'rt' from the 'platform_label'
> array.  In order to avoid potential leaks of kernel memory values, block
> speculative execution of the instruction stream that could issue further
> reads based on an invalid 'rt' value.


In detail.
a) This code is fast path packet forwarding code.  Introducing an
   unconditional pipeline stall is not ok.

   AKA either there is no speculation and so this is invulnerable
   or there is speculation and you are creating an unconditional
   pipeline stall here.

   My back of the napkin calculations say that a pipeline stall
   is about 20 cycles.  Which is about the same length of time
   as a modern cache miss.

   On a good day this code will perform with 0 cache misses. On a less
   good day 1 cache miss.  Which means you are quite possibly doubling
   the runtime of mpls_forward.

b) The array is dynamically allocated which should provide some
   protection, as it will be more difficult to predict the address
   of the array which is needed to craft a malicious userspace value.

c) The code can be trivially modified to say:

static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
{
struct mpls_route *rt = NULL;

if (index < net->mpls.platform_labels) {
struct mpls_route __rcu **platform_label =
rcu_dereference(net->mpls.platform_label);
rt = rcu_dereference(platform_label[index & ((1 << 20) - 1)]);
}
return rt;
}

AKA a static mask will ensure that there is not a primitive that can be
used to access all of memory.  That is max a 1 cycle slowdown in the
code, which is a much better trade off.

d) If we care more it is straight forward to modify
   resize_platform_label_table() to ensure that the size of the array
   is always a power of 2.

e) The fact that a pointer is returned from the array and it is treated
   like a pointer would seem to provide a defense against the
   exfiltration technique of using the value read as an index into
   a small array, that user space code can probe aliased cached
   lines of, to see which value was dereferenced.


So to this patch in particular.
Nacked-by: "Eric W. Biederman" 

This code path will be difficult to exploit.  This change messes with
performance.  There are ways to make this code path useless while
preserving the performance of the code.

Eric

>
> Based on an original patch by Elena Reshetova.
>
> Cc: "David S. Miller" 
> Cc: Eric W. Biederman 
> Cc: netdev@vger.kernel.org
> Signed-off-by: Elena Reshetova 
> Signed-off-by: Dan Williams 
> ---
>  net/mpls/af_mpls.c |   12 +++-
>  1 file changed, 7 insertions(+), 5 deletions(-)
>
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index 8ca9915befc8..ebcf0e246cfe 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -8,6 +8,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -77,12 +78,13 @@ static void rtmsg_lfib(int event, u32 label, struct 
> mpls_route *rt,
>  static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned 
> index)
>  {
>   struct mpls_route *rt = NULL;
> + struct mpls_route __rcu **platform_label =
> + rcu_dereference(net->mpls.platform_label);
> + struct mpls_route __rcu **rtp;
>  
> - if (index < net->mpls.platform_labels) {
> - struct mpls_route __rcu **platform_label =
> - rcu_dereference(net->mpls.platform_label);
> - rt = rcu_dereference(platform_label[index]);
> - }
> + if ((rtp = nospec_array_ptr(platform_label, index,
> + net->mpls.platform_labels)))
> + rt = rcu_dereference(*rtp);
>   return rt;
>  }
>  


Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev

2018-01-08 Thread David Miller
From: "lipeng (Y)" 
Date: Tue, 9 Jan 2018 10:48:04 +0800

> So I think it is OK if you can revert [patch 12/20 ]("net: hns3: Add
> packet statistics of netdev").

I think it is OK if you send the revert patch, which is what I
am asking for :-)


Re: [PATCH net-next v2 1/6] net: Fix netdev_WARN_ONCE macro

2018-01-08 Thread David Miller
From: Joe Perches 
Date: Mon, 08 Jan 2018 18:42:01 -0800

> On Sun, 2018-01-07 at 12:08 +0200, Gal Pressman wrote:
>> netdev_WARN_ONCE is broken (whoops..), this fix will remove the
>> unnecessary "condition" parameter, add the missing comma and change
>> "arg" to "args".
>> 
>> Fixes: 375ef2b1f0d0 ("net: Introduce netdev_*_once functions")
>> Signed-off-by: Gal Pressman 
>> Reviewed-by: Saeed Mahameed 
>> ---
>>  include/linux/netdevice.h | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 352066e..5ff1ef9 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -4407,8 +4407,8 @@ do {   
>> \
>>  WARN(1, "netdevice: %s%s\n" format, netdev_name(dev),   \
>>   netdev_reg_state(dev), ##args)
>>  
>> -#define netdev_WARN_ONCE(dev, condition, format, arg...)\
>> -WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev)   \
>> +#define netdev_WARN_ONCE(dev, format, args...)  
>> \
>> +WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev),  \
> 
> You sure you want the newline before the format?

Hmmm, Gal please send me a relative fix for this.


Re: [PATCH v2] openvswitch: Trim off padding before L3+ netfilter processing

2018-01-08 Thread Ed Swierk
On 1/6/18 10:57, Pravin Shelar wrote:
> On Fri, Jan 5, 2018 at 10:59 PM, Ed Swierk  wrote:
>>
>>
>> On Jan 5, 2018 22:17, "Pravin Shelar"  wrote:
>>
>> On Fri, Jan 5, 2018 at 3:20 PM, Ed Swierk 
>> wrote:
>>> On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk 
>>> wrote:
 On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar  wrote:
> OVS already pull all required headers in skb linear data, so no need
> to redo all of it. only check required is the ip-checksum validation.
> I think we could avoid it in most of cases by checking skb length to
> ipheader length before verifying the ip header-checksum.

 Shouldn't the IP header checksum be verified even earlier, like in
 key_extract(), before actually using any of the fields in the IP
 header?
>>>
>>> Something like this for verifying the IP header checksum (not tested):
>>>
>> AFAIU openflow does not need this verification, so it is not required
>> in flow extract.
>>
>>
>> Okay. How about my proposed trimming implementation, caching the pad length
>> in the ovs cb?
>>
> Caching the length is not that simple, OVS actions can change the
> length. Keeping it consistent with packet would be more work, so lets
> calculate it in ovs-ct function.
> 

Something like this?

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a38c80e..282325d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4084,6 +4084,8 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
 unsigned int transport_len,
 __sum16(*skb_chkf)(struct sk_buff *skb));
 
+int skb_network_trim(struct sk_buff *skb);
+
 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
  * @skb: skb to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 08f5740..c68e927 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4740,6 +4740,41 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(skb_checksum_trimmed);
 
+/**
+ * skb_network_trim - trim skb to length specified by the network header
+ * @skb: the skb to trim
+ *
+ * Trims the skb to the length specified by the network header,
+ * removing any trailing padding. Leaves the skb alone if the protocol
+ * is not IP or IPv6. Frees the skb on error.
+ * 
+ * Caller needs to pull the skb to the network header.
+ */
+int skb_network_trim(struct sk_buff *skb)
+{
+   unsigned int len;
+   int err;
+
+   switch (skb->protocol) {
+   case htons(ETH_P_IP):
+   len = ntohs(ip_hdr(skb)->tot_len);
+   break;
+   case htons(ETH_P_IPV6):
+   len = sizeof(struct ipv6hdr)
+   + ntohs(ipv6_hdr(skb)->payload_len);
+   break;
+   default:
+   len = skb->len;
+   }
+
+   err = pskb_trim_rcsum(skb, len);
+   if (unlikely(err))
+   kfree_skb(skb);
+
+   return err;
+}
+EXPORT_SYMBOL(skb_network_trim);
+
 void __skb_warn_lro_forwarding(const struct sk_buff *skb)
 {
net_warn_ratelimited("%s: received packets cannot be forwarded while 
LRO is enabled\n",
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index b27c5c6..73418d3 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1112,6 +1112,10 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
nh_ofs = skb_network_offset(skb);
skb_pull_rcsum(skb, nh_ofs);
 
+   err = skb_network_trim(skb);
+   if (err)
+   return err;
+
if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
err = handle_fragments(net, key, info->zone.id, skb);
if (err)


Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev

2018-01-08 Thread lipeng (Y)



On 2018/1/9 9:54, David Miller wrote:

From: Jakub Kicinski 
Date: Mon, 8 Jan 2018 17:50:21 -0800


Oh, I only noticed this extra misleading comment now.  Unless each queue
has a netdev, I don't see how these are per-queue.

If it isn't per-queue I want this change reverted.


[patch 12/20 ] add statistics of netdev for ethtool -S, netdev may have 
multi queue.


As discussion here,  it is duplicate to add this patch.


I revert  [patch 12/20 ] , and then test on my board, HNS3 basic function and 
ethtool -S work well.

So I think it is OK if you can revert  [patch 12/20 ]("net: hns3: Add packet 
statistics of netdev").


Thanks
Peng Li


.






Re: [PATCH net-next v2 1/6] net: Fix netdev_WARN_ONCE macro

2018-01-08 Thread Joe Perches
On Sun, 2018-01-07 at 12:08 +0200, Gal Pressman wrote:
> netdev_WARN_ONCE is broken (whoops..), this fix will remove the
> unnecessary "condition" parameter, add the missing comma and change
> "arg" to "args".
> 
> Fixes: 375ef2b1f0d0 ("net: Introduce netdev_*_once functions")
> Signed-off-by: Gal Pressman 
> Reviewed-by: Saeed Mahameed 
> ---
>  include/linux/netdevice.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 352066e..5ff1ef9 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -4407,8 +4407,8 @@ do {
> \
>   WARN(1, "netdevice: %s%s\n" format, netdev_name(dev),   \
>netdev_reg_state(dev), ##args)
>  
> -#define netdev_WARN_ONCE(dev, condition, format, arg...) \
> - WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev)   \
> +#define netdev_WARN_ONCE(dev, format, args...)   
> \
> + WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev),  \

You sure you want the newline before the format?

> netdev_reg_state(dev), ##args)
>  
>  /* netif printk helpers, similar to netdev_printk */


RE: [PATCH net-next v2 5/6] bnx2x: Replace WARN_ONCE with netdev_WARN_ONCE

2018-01-08 Thread Kalluru, Sudarsana
-Original Message-
From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org] On 
Behalf Of Gal Pressman
Sent: 07 January 2018 15:39
To: David S. Miller 
Cc: netdev@vger.kernel.org; Tariq Toukan ; Saeed Mahameed 
; Gal Pressman ; Elior, Ariel 

Subject: [PATCH net-next v2 5/6] bnx2x: Replace WARN_ONCE with netdev_WARN_ONCE

Use the more appropriate netdev_WARN_ONCE instead of WARN_ONCE macro.

Signed-off-by: Gal Pressman 
Reviewed-by: Saeed Mahameed 
Cc: Ariel Elior 
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 57eb26d..d7c98e8 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -738,8 +738,9 @@ static void bnx2x_gro_receive(struct bnx2x *bp, struct 
bnx2x_fastpath *fp,
bnx2x_gro_csum(bp, skb, bnx2x_gro_ipv6_csum);
break;
default:
-   WARN_ONCE(1, "Error: FW GRO supports only IPv4/IPv6, 
not 0x%04x\n",
- be16_to_cpu(skb->protocol));
+   netdev_WARN_ONCE(bp->dev,
+"Error: FW GRO supports only 
IPv4/IPv6, not 0x%04x\n",
+be16_to_cpu(skb->protocol));
}
}
 #endif
-- 
2.7.4

Acked-by: Sudarsana Kalluru 


RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode

2018-01-08 Thread Chris Mi
> -Original Message-
> From: n...@orbyte.nwl.cc [mailto:n...@orbyte.nwl.cc] On Behalf Of Phil
> Sutter
> Sent: Monday, January 8, 2018 9:32 PM
> To: Chris Mi 
> Cc: dsah...@gmail.com; marcelo.leit...@gmail.com;
> netdev@vger.kernel.org; gerlitz...@gmail.com;
> step...@networkplumber.org
> Subject: Re: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
> 
> Hi Chris,
> 
> On Mon, Jan 08, 2018 at 02:03:53AM +, Chris Mi wrote:
> > > On Thu, Jan 04, 2018 at 04:34:51PM +0900, Chris Mi wrote:
> > > > The insertion rate is improved more than 10%.
> > >
> > > Did you measure the effect of increasing batch sizes?
> > Yes. Even if we enlarge the batch size bigger than 10, there is no big
> improvement.
> > I think that's because current kernel doesn't process the requests in
> parallel.
> > If kernel processes the requests in parallel, I believe specifying a
> > bigger batch size will get a better result.
> 
> But throughput doesn't regress at some point, right? I think that's the 
> critical
> aspect when considering an "unlimited" batch size.
> 
> On Mon, Jan 08, 2018 at 08:00:00AM +, Chris Mi wrote:
> > After testing, I find that the message passed to kernel should not be too
> big.
> > If it is bigger than about 64K, sendmsg returns -1, errno is 90 (EMSGSIZE).
> > That is about 400 commands.  So how about set batch size to 128 which is
> big enough?
> 
> If that's the easiest way, why not. At first, I thought one could maybe send
> the collected messages in chunks of suitable size, but that's probably not
> worth the effort.
I did some testing. If we read a million commands into memory and send them in 
chunks of 128,
we'll have a big regression. It takes about 21 seconds.




Re: [PATCH bpf-next] bpf: introduce BPF_JIT_ALWAYS_ON config

2018-01-08 Thread Alexei Starovoitov

On 1/8/18 4:02 PM, Jakub Kicinski wrote:

On Mon, 8 Jan 2018 22:59:04 +0100, Daniel Borkmann wrote:

@@ -1453,6 +1457,11 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog 
*fp, int *err)
 */
*err = bpf_check_tail_call(fp);

+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+   if (!fp->jited)
+   *err = -ENOTSUPP;
+#endif


I think programs JITed for offload won't have fp->jited set, but
those are pretty safe from CPU bugs.  Should we set fp->jited = 1; in
bpf_prog_offload_compile()?  Just throwing "&& !bpf_prog_is_dev_bound()"
in here seems cleaner to me.


good catch. will fix in the v2.



Re: [PATCH net-next 1/3] ethtool: Ensure new ring parameters are within bounds during SRINGPARAM

2018-01-08 Thread Jakub Kicinski
On Mon,  8 Jan 2018 16:00:24 +0200, Tariq Toukan wrote:
> From: Eugenia Emantayev 
> 
> Add a sanity check to ensure that all requested ring parameters
> are within bounds, which should reduce errors in driver implementation.

(y)

> Signed-off-by: Eugenia Emantayev 
> Signed-off-by: Tariq Toukan 
> ---
>  net/core/ethtool.c | 13 +++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/net/core/ethtool.c b/net/core/ethtool.c
> index 50a79203043b..9ea7cd52fde0 100644
> --- a/net/core/ethtool.c
> +++ b/net/core/ethtool.c
> @@ -1704,14 +1704,23 @@ static int ethtool_get_ringparam(struct net_device 
> *dev, void __user *useraddr)
>  
>  static int ethtool_set_ringparam(struct net_device *dev, void __user 
> *useraddr)
>  {
> - struct ethtool_ringparam ringparam;
> + struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM };
>  
> - if (!dev->ethtool_ops->set_ringparam)
> + if (!dev->ethtool_ops->set_ringparam || 
> !dev->ethtool_ops->get_ringparam)
>   return -EOPNOTSUPP;
>  
>   if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))
>   return -EFAULT;
>  
> + dev->ethtool_ops->get_ringparam(dev, &max);

Perhaps check the return value here?  It's pretty unlikely but
get_ringparam may fail.

> + /* ensure new ring parameters are within the maximums */
> + if (ringparam.rx_pending > max.rx_max_pending ||
> + ringparam.rx_mini_pending > max.rx_mini_max_pending ||
> + ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending ||
> + ringparam.tx_pending > max.tx_max_pending)
> + return -EINVAL;
> +
>   return dev->ethtool_ops->set_ringparam(dev, &ringparam);
>  }
>  


Re: [PATCH 07/18] [media] uvcvideo: prevent bounds-check bypass via speculative execution

2018-01-08 Thread Dan Williams
On Mon, Jan 8, 2018 at 3:23 AM, Laurent Pinchart
 wrote:
> Hi Dan,
>
> Thank you for the patch.
>
> On Saturday, 6 January 2018 03:10:32 EET Dan Williams wrote:
>> Static analysis reports that 'index' may be a user controlled value that
>> is used as a data dependency to read 'pin' from the
>> 'selector->baSourceID' array. In order to avoid potential leaks of
>> kernel memory values, block speculative execution of the instruction
>> stream that could issue reads based on an invalid value of 'pin'.
>
> I won't repeat the arguments already made in the thread regarding having
> documented coverity rules for this, even if I agree with them.
>
>> Based on an original patch by Elena Reshetova.
>>
>> Cc: Laurent Pinchart 
>> Cc: Mauro Carvalho Chehab 
>> Cc: linux-me...@vger.kernel.org
>> Signed-off-by: Elena Reshetova 
>> Signed-off-by: Dan Williams 
>> ---
>>  drivers/media/usb/uvc/uvc_v4l2.c |7 +--
>>  1 file changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/media/usb/uvc/uvc_v4l2.c
>> b/drivers/media/usb/uvc/uvc_v4l2.c index 3e7e283a44a8..7442626dc20e 100644
>> --- a/drivers/media/usb/uvc/uvc_v4l2.c
>> +++ b/drivers/media/usb/uvc/uvc_v4l2.c
>> @@ -22,6 +22,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>
>>  #include 
>>  #include 
>> @@ -810,6 +811,7 @@ static int uvc_ioctl_enum_input(struct file *file, void
>> *fh, struct uvc_entity *iterm = NULL;
>>   u32 index = input->index;
>>   int pin = 0;
>> + __u8 *elem;
>>
>>   if (selector == NULL ||
>>   (chain->dev->quirks & UVC_QUIRK_IGNORE_SELECTOR_UNIT)) {
>> @@ -820,8 +822,9 @@ static int uvc_ioctl_enum_input(struct file *file, void
>> *fh, break;
>>   }
>>   pin = iterm->id;
>> - } else if (index < selector->bNrInPins) {
>> - pin = selector->baSourceID[index];
>> + } else if ((elem = nospec_array_ptr(selector->baSourceID, index,
>> + selector->bNrInPins))) {
>> + pin = *elem;
>>   list_for_each_entry(iterm, &chain->entities, chain) {
>>   if (!UVC_ENTITY_IS_ITERM(iterm))
>>   continue;
>
> (adding a bit more context)
>
>>   if (iterm->id == pin)
>>   break;
>>   }
>>   }
>>
>>   if (iterm == NULL || iterm->id != pin)
>>   return -EINVAL;
>>
>>   memset(input, 0, sizeof(*input));
>>   input->index = index;
>>   strlcpy(input->name, iterm->name, sizeof(input->name));
>>   if (UVC_ENTITY_TYPE(iterm) == UVC_ITT_CAMERA)
>>   input->type = V4L2_INPUT_TYPE_CAMERA;
>
> So pin is used to search for an entry in the chain->entities list. Entries in
> that list are allocated separately through kmalloc and can thus end up in
> different cache lines, so I agree we have an issue. However, this is mitigated
> by the fact that typical UVC devices have a handful (sometimes up to a dozen)
> entities, so an attacker would only be able to read memory values that are
> equal to the entity IDs used by the device. Entity IDs can be freely allocated
> but typically count continuously from 0. It would take a specially-crafted UVC
> device to be able to read all memory.
>
> On the other hand, as this is nowhere close to being a fast path, I think we
> can close this potential hole as proposed in the patch. So,
>
> Reviewed-by: Laurent Pinchart 

Thanks Laurent!

> Will you merge the whole series in one go, or would you like me to take the
> patch in my tree ? In the latter case I'll wait until the nospec_array_ptr()
> gets merged in mainline.

I'll track it for now. Until the 'nospec_array_ptr()' discussion
resolves there won't be a stabilized commit-id for you to base a
branch.


Re: Subject: [RFC][PATCH 04/11] stmmac: fix breakage in stmmac_hw_setup()

2018-01-08 Thread David Miller
From: Al Viro 
Date: Fri, 05 Jan 2018 19:31:58 +

> Since "drivers: net: stmmac: reworking the PCS code" ->pcs_ctrl_ane()
> had been taking iomem address to access as the first argument; its
> predecessor (->ctrl_ane()) used to take struct mac_device_info instead.
> 
> One of the callers had not been converted; as the result, instead of
> reading and modifying a word in card iomem we read and modify a word
> in (or near) the in-core struct mac_device_info.
> 
> Fixes: 70523e639bf8 (drivers: net: stmmac: reworking the PCS code)
> Cc: sta...@vger.kernel.org
> Signed-off-by: Al Viro 

Yikes...

Al, can you split the bug fixes like this one into a separate series
for me to pull into my net GIT tree?

Don't include the pure annotation or partial endianness conversion
ones, those should go to net-next.


Re: [RESEND PATCH 2/3] net: ovs: remove unused hardirq.h

2018-01-08 Thread David Miller
From: "Yang Shi" 
Date: Tue, 09 Jan 2018 03:52:53 +0800

> Preempt counter APIs have been split out, currently, hardirq.h just
> includes irq_enter/exit APIs which are not used by openvswitch at all.
> 
> So, remove the unused hardirq.h.
> 
> Signed-off-by: Yang Shi 
> Acked-by: Pravin B Shelar 

Applied.


Re: [RESEND PATCH 1/3] net: caif: remove unused hardirq.h

2018-01-08 Thread David Miller
From: "Yang Shi" 
Date: Tue, 09 Jan 2018 03:52:52 +0800

> Preempt counter APIs have been split out, currently, hardirq.h just
> includes irq_enter/exit APIs which are not used by caif at all.
> 
> So, remove the unused hardirq.h.
> 
> Signed-off-by: Yang Shi 

Applied.


Re: [RESEND PATCH 3/3] net: tipc: remove unused hardirq.h

2018-01-08 Thread David Miller
From: "Yang Shi" 
Date: Tue, 09 Jan 2018 03:52:54 +0800

> Preempt counter APIs have been split out, currently, hardirq.h just
> includes irq_enter/exit APIs which are not used by TIPC at all.
> 
> So, remove the unused hardirq.h.
> 
> Signed-off-by: Yang Shi 
> Acked-by: Ying Xue 
> Tested-by: Ying Xue 

Applied.


Re: [PATCH 00/52] Netfilter/IPVS updates for net-next

2018-01-08 Thread David Miller
From: Pablo Neira Ayuso 
Date: Mon,  8 Jan 2018 21:19:08 +0100

> The following patchset contains Netfilter/IPVS updates for your
> net-next tree:
 ...
> 4) Add generic flow table offload infrastructure for nf_tables, this
>includes the netlink control plane and support for IPv4, IPv6 and
>mixed IPv4/IPv6 dataplanes. This comes with NAT support too. This
>patchset adds the IPS_OFFLOAD conntrack status bit to indicate that
>this flow has been offloaded.

Have driver maintainers signed off on your offload design and driver
interfaces?

I've pulled, but the above is really important to indicate when a new
offload feature is added.

Thanks.


Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev

2018-01-08 Thread David Miller
From: Jakub Kicinski 
Date: Mon, 8 Jan 2018 17:50:21 -0800

> Oh, I only noticed this extra misleading comment now.  Unless each queue
> has a netdev, I don't see how these are per-queue.

If it isn't per-queue I want this change reverted.


Re: [PATCH net-next v2 0/6] Replace WARN_ONCE usages with netdev_WARN_ONCE

2018-01-08 Thread David Miller
From: Gal Pressman 
Date: Sun,  7 Jan 2018 12:08:34 +0200

> This series will fix an issue in netdev_WARN_ONCE, improve its formatting and
> replace drivers' usage of WARN_ONCE to netdev_WARN_ONCE.
> 
> Driver specific patches were compilation tested, in addition, functional 
> tested
> on Mellanox NIC.
> 
> v1->v2:
> - Addressed commit message comments in patch #1

Series applied, thanks.


Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev

2018-01-08 Thread Jakub Kicinski
On Mon, 8 Jan 2018 17:46:02 -0800, Jakub Kicinski wrote:
> On Mon, 08 Jan 2018 20:39:13 -0500 (EST), David Miller wrote:
> > From: Jakub Kicinski 
> > Date: Mon, 8 Jan 2018 12:04:31 -0800
> >   
> > > Ugh, I so didn't review this in time :(  I think there is a consensus
> > > that we should avoid duplicating standard stats in ethtool.  Especially
> > > those old ones.  Like "collisions", I assume this is a modern NIC, are
> > > collisions still a thing?
> > 
> > There is no standard way to get per-queue values, and ethtool stats are
> > how pretty much every driver provides it.  
> 
> Right, agreed.  I'm only objecting to this patch (12/20), where we can
> see the telltale code like this:
> 
> + const struct rtnl_link_stats64 *net_stats;
> + struct rtnl_link_stats64 temp;
> +
> + net_stats = dev_get_stats(netdev, &temp);
> + for (i = 0; i < HNS3_NETDEV_STATS_COUNT; i++) {
> + stat = (u8 *)net_stats + hns3_netdev_stats[i].stats_offset;
> + *data++ = *(u64 *)stat;
> + }
> 
> Where:
> 
> +#define HNS3_NETDEV_STAT(_string, _member)   {   \
> + .stats_string = _string,\
> + .stats_offset = offsetof(struct rtnl_link_stats64, _member) \
> +}
> +
> +static const struct hns3_stats hns3_netdev_stats[] = {
> + /* Rx per-queue statistics */

Oh, I only noticed this extra misleading comment now.  Unless each queue
has a netdev, I don't see how these are per-queue.

> + HNS3_NETDEV_STAT("rx_packets", rx_packets),
> + HNS3_NETDEV_STAT("tx_packets", tx_packets),
> 
> etc.  IOW dumping struct rtnl_link_stats64 to ethtool -S member by
> member.
> 
> Let me put the netlink per-queue stats on my soft TODO list :)
> 


RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode

2018-01-08 Thread Chris Mi
> -Original Message-
> From: Stephen Hemminger [mailto:step...@networkplumber.org]
> Sent: Monday, January 8, 2018 11:40 PM
> To: Chris Mi 
> Cc: David Ahern ; Phil Sutter ;
> marcelo.leit...@gmail.com; netdev@vger.kernel.org; gerlitz...@gmail.com
> Subject: Re: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
> 
> On Mon, 8 Jan 2018 08:00:00 +
> Chris Mi  wrote:
> 
> > > >> I wonder whether specifying the batch size is necessary at all.
> > > >> Couldn't batch mode just collect messages until either EOF or an
> > > >> incompatible command is encountered which then triggers a commit
> > > >> to kernel? This might simplify code quite a bit.
> > > > That's a good suggestion.
> > >
> > > Thanks for your time on this, Chris.
> > After testing, I find that the message passed to kernel should not be too
> > big.
> > If it is bigger than about 64K, sendmsg returns -1, errno is 90 (EMSGSIZE).
> > That is about 400 commands.  So how about set batch size to 128 which is
> > big enough?
> 
> 
> Use sendmmsg?
Maybe we can try that, but there is also a limit on it.


Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev

2018-01-08 Thread Jakub Kicinski
On Mon, 08 Jan 2018 20:39:13 -0500 (EST), David Miller wrote:
> From: Jakub Kicinski 
> Date: Mon, 8 Jan 2018 12:04:31 -0800
> 
> > Ugh, I so didn't review this in time :(  I think there is a consensus
> > that we should avoid duplicating standard stats in ethtool.  Especially
> > those old ones.  Like "collisions", I assume this is a modern NIC, are
> > collisions still a thing?  
> 
> There is no standard way to get per-queue values, and ethtool stats are
> how pretty much every driver provides it.

Right, agreed.  I'm only objecting to this patch (12/20), where we can
see the telltale code like this:

+   const struct rtnl_link_stats64 *net_stats;
+   struct rtnl_link_stats64 temp;
+
+   net_stats = dev_get_stats(netdev, &temp);
+   for (i = 0; i < HNS3_NETDEV_STATS_COUNT; i++) {
+   stat = (u8 *)net_stats + hns3_netdev_stats[i].stats_offset;
+   *data++ = *(u64 *)stat;
+   }

Where:

+#define HNS3_NETDEV_STAT(_string, _member) {   \
+   .stats_string = _string,\
+   .stats_offset = offsetof(struct rtnl_link_stats64, _member) \
+}
+
+static const struct hns3_stats hns3_netdev_stats[] = {
+   /* Rx per-queue statistics */
+   HNS3_NETDEV_STAT("rx_packets", rx_packets),
+   HNS3_NETDEV_STAT("tx_packets", tx_packets),

etc.  IOW dumping struct rtnl_link_stats64 to ethtool -S member by
member.

Let me put the netlink per-queue stats on my soft TODO list :)



[PATCH] ath9k: add a quirk to set use_msi automatically

2018-01-08 Thread AceLan Kao
Some platforms (BIOS) block legacy interrupts (INTx) and only allow MSI
for the WLAN device. So add a quirk to list those machines and set
use_msi automatically.
Add the following platforms to the quirk:
   Dell Inspiron 24-3460
   Dell Inspiron 3472
   Dell Inspiron 14-3473
   Dell Vostro 3262
   Dell Vostro 15-3572

Signed-off-by: AceLan Kao 
---
 drivers/net/wireless/ath/ath9k/init.c | 53 +++
 1 file changed, 53 insertions(+)

diff --git a/drivers/net/wireless/ath/ath9k/init.c 
b/drivers/net/wireless/ath/ath9k/init.c
index 43adead..e479fae 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "ath9k.h"
@@ -96,6 +97,56 @@ static const struct ieee80211_tpt_blink ath9k_tpt_blink[] = {
 };
 #endif
 
+static int __init set_use_msi(const struct dmi_system_id *dmi)
+{
+   ath9k_use_msi = 1;
+   return 1;
+}
+
+static const struct dmi_system_id ath9k_quirks[] __initconst = {
+   {
+   .callback = set_use_msi,
+   .ident = "Dell Inspiron 24-3460",
+   .matches = {
+   DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+   DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 24-3460"),
+   },
+   },
+   {
+   .callback = set_use_msi,
+   .ident = "Dell Vostro 3262",
+   .matches = {
+   DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+   DMI_MATCH(DMI_PRODUCT_NAME, "Vostro 3262"),
+   },
+   },
+   {
+   .callback = set_use_msi,
+   .ident = "Dell Inspiron 3472",
+   .matches = {
+   DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+   DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 3472"),
+   },
+   },
+   {
+   .callback = set_use_msi,
+   .ident = "Dell Vostro 15-3572",
+   .matches = {
+   DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+   DMI_MATCH(DMI_PRODUCT_NAME, "Vostro 15-3572"),
+   },
+   },
+   {
+   .callback = set_use_msi,
+   .ident = "Dell Inspiron 14-3473",
+   .matches = {
+   DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+   DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 14-3473"),
+   },
+   },
+   {}
+};
+
 static void ath9k_deinit_softc(struct ath_softc *sc);
 
 static void ath9k_op_ps_wakeup(struct ath_common *common)
@@ -1104,6 +1155,8 @@ static int __init ath9k_init(void)
goto err_pci_exit;
}
 
+   dmi_check_system(ath9k_quirks);
+
return 0;
 
  err_pci_exit:
-- 
2.7.4



Re: [PATCH net-next 12/20] net: hns3: Add packet statistics of netdev

2018-01-08 Thread David Miller
From: Jakub Kicinski 
Date: Mon, 8 Jan 2018 12:04:31 -0800

> Ugh, I so didn't review this in time :(  I think there is a consensus
> that we should avoid duplicating standard stats in ethtool.  Especially
> those old ones.  Like "collisions", I assume this is a modern NIC, are
> collisions still a thing?

There is no standard way to get per-queue values, and ethtool stats are
how pretty much every driver provides it.


Re: b43: Replace mdelay with msleep in b43_radio_2057_init_post

2018-01-08 Thread Jia-Ju Bai



On 2018/1/9 0:31, Larry Finger wrote:

On 01/08/2018 10:21 AM, Kalle Valo wrote:

Jia-Ju Bai  wrote:


b43_radio_2057_init_post is not called in an interrupt handler
nor holding a spinlock.
The function mdelay in it can be replaced with msleep, to reduce 
busy wait.


Signed-off-by: Jia-Ju Bai 


You submitted an identical patch a week earlier:

https://patchwork.kernel.org/patch/10137671/

How is this different? Also always add version number to the patch so that the
maintainers can follow the changes easily:

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches#patch_version_missing 



https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches#changelog_missing 



I had negative comments on one of those due to the possibility of 
msleep(2) extending as long as 20 msec. Until the author, or someone 
else, can test that this is OK, then the mdelay(2) can only be 
replaced with usleep_range(2000, 3000).


NACK for both.

Larry



Sorry for my mistake.
I have sent a patch v2 using usleep_range(2000, 3000), and you can have 
a look :)



Thanks,
Jia-Ju Bai


[PATCH v2] b43: Replace mdelay with usleep_range in b43_radio_2057_init_post

2018-01-08 Thread Jia-Ju Bai
b43_radio_2057_init_post is not called in an interrupt handler
nor holding a spinlock.
The function mdelay in it can be replaced with usleep_range, 
to reduce busy wait.

Signed-off-by: Jia-Ju Bai 
---
v2:
* Replace mdelay with usleep_range, instead of msleep in v1.
  Thank Larry for good advice.
---
 drivers/net/wireless/broadcom/b43/phy_n.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/broadcom/b43/phy_n.c 
b/drivers/net/wireless/broadcom/b43/phy_n.c
index a5557d7..f2a2f41 100644
--- a/drivers/net/wireless/broadcom/b43/phy_n.c
+++ b/drivers/net/wireless/broadcom/b43/phy_n.c
@@ -1031,7 +1031,7 @@ static void b43_radio_2057_init_post(struct b43_wldev 
*dev)
 
b43_radio_set(dev, R2057_RFPLL_MISC_CAL_RESETN, 0x78);
b43_radio_set(dev, R2057_XTAL_CONFIG2, 0x80);
-   mdelay(2);
+   usleep_range(2000, 3000);
b43_radio_mask(dev, R2057_RFPLL_MISC_CAL_RESETN, ~0x78);
b43_radio_mask(dev, R2057_XTAL_CONFIG2, ~0x80);
 
-- 
1.7.9.5



Re: [PATCH net 0/3] Some sockopt optlen fixes

2018-01-08 Thread Neil Horman
On Mon, Jan 08, 2018 at 07:02:26PM -0200, Marcelo Ricardo Leitner wrote:
> Hangbin Liu reported that some SCTP sockopt are allowing the user to get
> the kernel to allocate really large buffers by not having a ceiling on
> optlen.
> 
> This patchset address this issue (in patch 2), replace an GFP_ATOMIC
> that isn't needed and avoid calculating the option size multiple times
> in some setsockopt.
> 
> Marcelo Ricardo Leitner (3):
>   sctp: GFP_ATOMIC is not needed in sctp_setsockopt_events
>   sctp: add a ceiling to optlen in some sockopts
>   sctp: make use of pre-calculated len
> 
>  net/sctp/socket.c | 28 +---
>  1 file changed, 21 insertions(+), 7 deletions(-)
> 
> -- 
> 2.14.3
> 
> 
Series
Acked-by: Neil Horman 



RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode

2018-01-08 Thread Chris Mi
> -Original Message-
> From: n...@orbyte.nwl.cc [mailto:n...@orbyte.nwl.cc] On Behalf Of Phil
> Sutter
> Sent: Monday, January 8, 2018 9:32 PM
> To: Chris Mi 
> Cc: dsah...@gmail.com; marcelo.leit...@gmail.com;
> netdev@vger.kernel.org; gerlitz...@gmail.com;
> step...@networkplumber.org
> Subject: Re: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
> 
> Hi Chris,
> 
> On Mon, Jan 08, 2018 at 02:03:53AM +, Chris Mi wrote:
> > > On Thu, Jan 04, 2018 at 04:34:51PM +0900, Chris Mi wrote:
> > > > The insertion rate is improved more than 10%.
> > >
> > > Did you measure the effect of increasing batch sizes?
> > Yes. Even if we enlarge the batch size bigger than 10, there is no big
> > improvement.
> > I think that's because current kernel doesn't process the requests in
> > parallel.
> > If kernel processes the requests in parallel, I believe specifying a
> > bigger batch size will get a better result.
> 
> But throughput doesn't regress at some point, right? I think that's the
> critical aspect when considering an "unlimited" batch size.
Yes.
> 
> On Mon, Jan 08, 2018 at 08:00:00AM +, Chris Mi wrote:
> > After testing, I find that the message passed to kernel should not be too
> > big.
> > If it is bigger than about 64K, sendmsg returns -1, errno is 90 (EMSGSIZE).
> > That is about 400 commands.  So how about set batch size to 128 which is
> > big enough?
> 
> If that's the easiest way, why not. At first, I thought one could maybe send
> the collected messages in chunks of suitable size, but that's probably not
> worth the effort.
OK.

-Chris


Re: [RFC PATCH bpf-next v2 0/4] Separate error injection table from kprobes

2018-01-08 Thread Masami Hiramatsu
On Thu, 4 Jan 2018 11:07:16 -0500
Josef Bacik  wrote:

> On Tue, Dec 26, 2017 at 04:46:28PM +0900, Masami Hiramatsu wrote:
> > Hi Josef and Alexei,
> > 
> > Here are the 2nd version of patches to moving error injection
> > table from kprobes. In this series I did a small fixes and
> > add function-based fault injection.
> > 
> > Here is the previous version:
> > 
> > https://lkml.org/lkml/2017/12/22/554
> > 
> > There are 2 main reasons why I separate it from kprobes.
> > 
> >  - kprobes users can modify execution path not only at 
> >error-injection whitelist functions but also other
> >functions. I don't like to suggest user that such
> >limitation is from kprobes itself.
> > 
> >  - This error injection information is also useful for
> >ftrace (function-hook) and livepatch. It should not
> >be limited by CONFIG_KPROBES.
> > 
> > So I introduced CONFIG_FUNCTION_ERROR_INJECTION for this feature.
> > Also CONFIG_FAIL_FUNCTION is added, which provides function-based
> > error injection interface via debugfs following fault-injection
> > framework. See [4/4].
> > 
> > Any thoughts?
> 
> Sorry Masami, I've been on vacation for the last two weeks.  This approach is
> fine by me, if we want to allow other mechanisms other than bpf to use this
> functionality then hooray.  I'll do a proper review when you post v3, just
> wanted to let you know I wasn't ignoring you.  Thanks,

Yeah, thank you for the kind notice ;)

BTW, could you tell me how I can run your test case?

When I tried to build the tests (samples/bpf) I got below error and stopped.

[mhiramat@devbox bpf]$ LANG=C make 
make -C ../../ /home/mhiramat/ksrc/linux/samples/bpf/
make[1]: Entering directory '/home/mhiramat/ksrc/linux'
  CHK include/config/kernel.release
  CHK include/generated/uapi/linux/version.h
  CHK include/generated/utsrelease.h
  CHK include/generated/bounds.h
  CHK include/generated/timeconst.h
  CHK include/generated/asm-offsets.h
  CALLscripts/checksyscalls.sh
  DESCEND  objtool
  CHK scripts/mod/devicetable-offsets.h
  HOSTCC  /home/mhiramat/ksrc/linux/samples/bpf/test_lru_dist.o
/home/mhiramat/ksrc/linux/samples/bpf/test_lru_dist.c:39:8: error: redefinition 
of 'struct list_head'
 struct list_head {
^
In file included from /home/mhiramat/ksrc/linux/samples/bpf/test_lru_dist.c:9:0:
./tools/include/linux/types.h:69:8: note: originally defined here
 struct list_head {
^
make[2]: *** [scripts/Makefile.host:107: 
/home/mhiramat/ksrc/linux/samples/bpf/test_lru_dist.o] Error 1
make[1]: *** [Makefile:1675: /home/mhiramat/ksrc/linux/samples/bpf/] Error 2
make[1]: Leaving directory '/home/mhiramat/ksrc/linux'
make: *** [Makefile:204: all] Error 2


Thank you,

-- 
Masami Hiramatsu 


Re: linux-next: manual merge of the net-next tree with the bpf tree

2018-01-08 Thread Alexei Starovoitov
On Tue, Jan 09, 2018 at 11:21:25AM +1100, Stephen Rothwell wrote:
> Hi all,
> 
> Today's linux-next merge of the net-next tree got a conflict in:
> 
>   tools/testing/selftests/bpf/test_align.c
> 
> between commit:
> 
>   2b36047e7889 ("selftests/bpf: fix test_align")
> 
> from the bpf tree and commit:
> 
>   6a28b446b7d2 ("selftests/bpf: adjust test_align expected output")
> 
> from the net-next tree.
> 
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
> 
> -- 
> Cheers,
> Stephen Rothwell
> 
> diff --cc tools/testing/selftests/bpf/test_align.c
> index 471bbbdb94db,fe916d29e166..
> --- a/tools/testing/selftests/bpf/test_align.c
> +++ b/tools/testing/selftests/bpf/test_align.c
> @@@ -473,8 -473,28 +473,8 @@@ static struct bpf_align_test tests[] = 
>   .prog_type = BPF_PROG_TYPE_SCHED_CLS,
>   .result = REJECT,
>   .matches = {
> - {4, "R5=pkt(id=0,off=0,r=0,imm=0)"},
> + {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},

thanks. That's correct resolution.



linux-next: manual merge of the net-next tree with the bpf tree

2018-01-08 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the net-next tree got a conflict in:

  tools/testing/selftests/bpf/test_align.c

between commit:

  2b36047e7889 ("selftests/bpf: fix test_align")

from the bpf tree and commit:

  6a28b446b7d2 ("selftests/bpf: adjust test_align expected output")

from the net-next tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc tools/testing/selftests/bpf/test_align.c
index 471bbbdb94db,fe916d29e166..
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@@ -473,8 -473,28 +473,8 @@@ static struct bpf_align_test tests[] = 
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.result = REJECT,
.matches = {
-   {4, "R5=pkt(id=0,off=0,r=0,imm=0)"},
+   {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
 -  /* ptr & 0x40 == either 0 or 0x40 */
 -  {5, "R5_w=inv(id=0,umax_value=64,var_off=(0x0; 0x40))"},
 -  /* ptr << 2 == unknown, (4n) */
 -  {7, 
"R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0;
 0xfffc))"},
 -  /* (4n) + 14 == (4n+2).  We blow our bounds, because
 -   * the add could overflow.
 -   */
 -  {8, "R5=inv(id=0,var_off=(0x2; 0xfffc))"},
 -  /* Checked s>=0 */
 -  {10, 
"R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 
0x7ffc))"},
 -  /* packet pointer + nonnegative (4n+2) */
 -  {12, 
"R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2;
 0x7ffc))"},
 -  {14, 
"R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2;
 0x7ffc))"},
 -  /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
 -   * We checked the bounds, but it might have been able
 -   * to overflow if the packet pointer started in the
 -   * upper half of the address space.
 -   * So we did not get a 'range' on R6, and the access
 -   * attempt will fail.
 -   */
 -  {16, 
"R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2;
 0x7ffc))"},
 +  /* R5 bitwise operator &= on pointer prohibited */
}
},
{


Re: [PATCH 06/18] x86, barrier: stop speculation for failed access_ok

2018-01-08 Thread Linus Torvalds
On Mon, Jan 8, 2018 at 3:53 PM, Dan Williams  wrote:
>
> I've been thinking the "and" is only suitable for the array bounds
> check, for get_user() we're trying to block speculation past
> access_ok() at which point we can only do the lfence?

Well, we *could* do the "and", at least for the simple cases (ie the
true "get_user()" that integrates the access_ok with the access).

IOW, mainly the code in arch/x86/lib/getuser.S.

But it probably is a lot simpler to just add the "lfence" to ASM_STAC,
because by definition those cases don't tend to be the truly critical
ones - people who use those functions tend to do one or two accesses,
and the real cost is likely the I$ misses and the D$ miss to get
current->addr_limit. Not to mention the "stac" itself, which is much
more expensive than the access on current microarchitectures.

But something like this *might* work:

   index c97d935a29e8..7fa3d293beaf 100644
   --- a/arch/x86/lib/getuser.S
   +++ b/arch/x86/lib/getuser.S
   @@ -38,8 +38,11 @@
   .text
ENTRY(__get_user_1)
   mov PER_CPU_VAR(current_task), %_ASM_DX
   -   cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
   +   mov TASK_addr_limit(%_ASM_DX),%_ASM_DX
   +   cmp %_ASM_DX,%_ASM_AX
   jae bad_get_user
   +   or $0xfff,%_ASM_DX
   +   and %_ASM_DX,%_ASM_AX
   ASM_STAC
1: movzbl (%_ASM_AX),%edx
   xor %eax,%eax

(this only does the one-byte case - the 2/4/8 byte cases are exactly the same).

The above is completely untested and might have some stupid
thinko/typo, so take it purely as a "example patch" to show the
concept, rather than actually do it.

But just adding "lfence" to the existing ASM_STAC is a hell of a lot
easier, and the performance difference between that trivial patch and
the above "let's be clever with 'and'" might not be measurable.

I really have no idea how expensive lfence might actually end up being
in practice. It's possible that lfence is actually fairly cheap in
kernel code, since we tend to not have very high IPC anyway.

 Linus


Re: [PATCH v2] openvswitch: Trim off padding before L3+ netfilter processing

2018-01-08 Thread Pravin Shelar
On Sat, Jan 6, 2018 at 10:57 AM, Pravin Shelar  wrote:
> On Fri, Jan 5, 2018 at 10:59 PM, Ed Swierk  wrote:
>>
>>
>> On Jan 5, 2018 22:17, "Pravin Shelar"  wrote:
>>
>> On Fri, Jan 5, 2018 at 3:20 PM, Ed Swierk 
>> wrote:
>>> On Fri, Jan 5, 2018 at 10:14 AM, Ed Swierk 
>>> wrote:
>>>> On Thu, Jan 4, 2018 at 7:36 PM, Pravin Shelar  wrote:
>>>>> OVS already pull all required headers in skb linear data, so no need
>>>>> to redo all of it. only check required is the ip-checksum validation.
>>>>> I think we could avoid it in most of cases by checking skb length to
>>>>> ipheader length before verifying the ip header-checksum.
>>>>
>>>> Shouldn't the IP header checksum be verified even earlier, like in
>>>> key_extract(), before actually using any of the fields in the IP
>>>> header?
>>>
>>> Something like this for verifying the IP header checksum (not tested):
>>>
>> AFAIU openflow does not need this verification, so it is not required
>> in flow extract.
>>
>>
>> Okay. How about my proposed trimming implementation, caching the pad length
>> in the ovs cb?
>>
> Caching the length is not that simple, OVS actions can change the
> length. Keeping it consistent with packet would be more work, so lets
> calculate it in ovs-ct function.

You could make it specific to skb-len-trimming, something like a
boolean flag, so that it is easy to reason about.


Re: [PATCH net-next 2/2] openvswitch: add erspan version II support

2018-01-08 Thread Pravin Shelar
On Fri, Jan 5, 2018 at 2:29 PM, William Tu  wrote:
> The patch adds support for configuring the erspan version II
> fields for openvswitch.
>
The patch looks good, but it could change the userspace API for
OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS; how are we going to handle
compatibility?

> Signed-off-by: William Tu 
> ---
>  include/uapi/linux/openvswitch.h |  12 +++-
>  net/openvswitch/flow_netlink.c   | 125 
> +++
>  2 files changed, 126 insertions(+), 11 deletions(-)
>
> diff --git a/include/uapi/linux/openvswitch.h 
> b/include/uapi/linux/openvswitch.h
> index 4265d7f9e1f2..3b1950c59a0c 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -273,6 +273,16 @@ enum {
>
>  #define OVS_VXLAN_EXT_MAX (__OVS_VXLAN_EXT_MAX - 1)
>
> +enum {
> +   OVS_ERSPAN_OPT_UNSPEC,
> +   OVS_ERSPAN_OPT_IDX, /* be32 index */
> +   OVS_ERSPAN_OPT_VER, /* u8 version number */
> +   OVS_ERSPAN_OPT_DIR, /* u8 direction */
> +   OVS_ERSPAN_OPT_HWID,/* u8 hardware ID */
> +   __OVS_ERSPAN_OPT_MAX,
> +};
> +
> +#define OVS_ERSPAN_OPT_MAX (__OVS_ERSPAN_OPT_MAX - 1)
>
>  /* OVS_VPORT_ATTR_OPTIONS attributes for tunnels.
>   */
> @@ -363,7 +373,7 @@ enum ovs_tunnel_key_attr {
> OVS_TUNNEL_KEY_ATTR_IPV6_SRC,   /* struct in6_addr src IPv6 
> address. */
> OVS_TUNNEL_KEY_ATTR_IPV6_DST,   /* struct in6_addr dst IPv6 
> address. */
> OVS_TUNNEL_KEY_ATTR_PAD,
> -   OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,/* be32 ERSPAN index. */
> +   OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,/* Nested OVS_ERSPAN_OPT_* */
> __OVS_TUNNEL_KEY_ATTR_MAX
>  };
>
> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> index bce1f78b0de5..696198cf3765 100644
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
> @@ -334,8 +334,10 @@ size_t ovs_tun_key_attr_size(void)
>  * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
>  */
> + nla_total_size(2)/* OVS_TUNNEL_KEY_ATTR_TP_SRC */
> -   + nla_total_size(2)/* OVS_TUNNEL_KEY_ATTR_TP_DST */
> -   + nla_total_size(4);   /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */
> +   + nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
> +   /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS is mutually exclusive with
> +* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
> +*/
>  }
>
>  static size_t ovs_nsh_key_attr_size(void)
> @@ -386,6 +388,13 @@ static const struct ovs_len_tbl 
> ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] =
> [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) },
>  };
>
> +static const struct ovs_len_tbl ovs_erspan_opt_lens[OVS_ERSPAN_OPT_MAX + 1] 
> = {
> +   [OVS_ERSPAN_OPT_IDX]= { .len = sizeof(u32) },
> +   [OVS_ERSPAN_OPT_VER]= { .len = sizeof(u8) },
> +   [OVS_ERSPAN_OPT_DIR]= { .len = sizeof(u8) },
> +   [OVS_ERSPAN_OPT_HWID]   = { .len = sizeof(u8) },
> +};
> +
>  static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX 
> + 1] = {
> [OVS_TUNNEL_KEY_ATTR_ID]= { .len = sizeof(u64) },
> [OVS_TUNNEL_KEY_ATTR_IPV4_SRC]  = { .len = sizeof(u32) },
> @@ -402,7 +411,8 @@ static const struct ovs_len_tbl 
> ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
> .next = 
> ovs_vxlan_ext_key_lens },
> [OVS_TUNNEL_KEY_ATTR_IPV6_SRC]  = { .len = sizeof(struct 
> in6_addr) },
> [OVS_TUNNEL_KEY_ATTR_IPV6_DST]  = { .len = sizeof(struct 
> in6_addr) },
> -   [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS]   = { .len = sizeof(u32) },
> +   [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS]   = { .len = OVS_ATTR_NESTED,
> +   .next = ovs_erspan_opt_lens },
>  };
>
>  static const struct ovs_len_tbl
> @@ -640,16 +650,78 @@ static int erspan_tun_opt_from_nlattr(const struct 
> nlattr *attr,
>  {
> unsigned long opt_key_offset;
> struct erspan_metadata opts;
> +   struct nlattr *a;
> +   u16 hwid, dir;
> +   int rem;
>
> BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
>
> memset(&opts, 0, sizeof(opts));
> -   opts.u.index = nla_get_be32(attr);
> +   nla_for_each_nested(a, attr, rem) {
> +   int type = nla_type(a);
>
> -   /* Index has only 20-bit */
> -   if (ntohl(opts.u.index) & ~INDEX_MASK) {
> -   OVS_NLERR(log, "ERSPAN index number %x too large.",
> - ntohl(opts.u.index));
> +   if (type > OVS_ERSPAN_OPT_MAX) {
> +   OVS_NLERR(log, "ERSPAN option %d out of range max %d",
> + type, OVS_ERSPAN_OPT_MAX);
> +   return -EINVAL;
> +   }
> +
> +   if (!check_attr_len(nla_len(a),
> +  

Re: [PATCH bpf-next] bpf: introduce BPF_JIT_ALWAYS_ON config

2018-01-08 Thread Jakub Kicinski
On Mon, 8 Jan 2018 22:59:04 +0100, Daniel Borkmann wrote:
> > @@ -1453,6 +1457,11 @@ struct bpf_prog *bpf_prog_select_runtime(struct 
> > bpf_prog *fp, int *err)
> >  */
> > *err = bpf_check_tail_call(fp);
> >  
> > +#ifdef CONFIG_BPF_JIT_ALWAYS_ON
> > +   if (!fp->jited)
> > +   *err = -ENOTSUPP;
> > +#endif  

I think programs JITed for offload won't have fp->jited set, but
those are pretty safe from CPU bugs.  Should we set fp->jited = 1; in
bpf_prog_offload_compile()?  Just throwing "&& !bpf_prog_is_dev_bound()"
in here seems cleaner to me.

FWIW if you have netdevsim compiled and recent iproute2, this will
work to check:

# ip link add type netdevsim
# ip link set netdevsim0 xdpoffload obj ~/xdp/pass.o


Re: [PATCH v3 bpf] bpf: prevent out-of-bounds speculation

2018-01-08 Thread Daniel Borkmann
On 01/08/2018 02:33 AM, Alexei Starovoitov wrote:
> Under speculation, CPUs may mis-predict branches in bounds checks. Thus,
> memory accesses under a bounds check may be speculated even if the
> bounds check fails, providing a primitive for building a side channel.
> 
> To avoid leaking kernel data round up array-based maps and mask the index
> after bounds check, so speculated load with out of bounds index will load
> either valid value from the array or zero from the padded area.
> 
> Unconditionally mask index for all array types even when max_entries
> are not rounded to power of 2 for root user.
> When map is created by unpriv user generate a sequence of bpf insns
> that includes AND operation to make sure that JITed code includes
> the same 'index & index_mask' operation.
> 
> If prog_array map is created by unpriv user replace
>   bpf_tail_call(ctx, map, index);
> with
>   if (index >= max_entries) {
> index &= map->index_mask;
> bpf_tail_call(ctx, map, index);
>   }
> (along with roundup to power 2) to prevent out-of-bounds speculation.
> There is secondary redundant 'if (index >= max_entries)' in the interpreter
> and in all JITs, but they can be optimized later if necessary.
> 
> Other array-like maps (cpumap, devmap, sockmap, perf_event_array, 
> cgroup_array)
> cannot be used by unpriv, so no changes there.
> 
> That fixes bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on
> all architectures with and without JIT.
> 
> v2->v3:
> Daniel noticed that attack potentially can be crafted via syscall commands
> without loading the program, so add masking to those paths as well.
> 
> Signed-off-by: Alexei Starovoitov 
> Acked-by: John Fastabend 

Applied to bpf tree, thanks Alexei!


Re: [PATCH 06/18] x86, barrier: stop speculation for failed access_ok

2018-01-08 Thread Dan Williams
On Mon, Jan 8, 2018 at 3:44 PM, Linus Torvalds
 wrote:
> On Mon, Jan 8, 2018 at 1:09 PM, Dan Williams  wrote:
>> On Sat, Jan 6, 2018 at 5:20 PM, Linus Torvalds
>>  wrote:
>>> On Sat, Jan 6, 2018 at 3:31 PM, Dan Williams  
>>> wrote:

>>>> I assume if we put this in uaccess_begin() we also need audit for
>>>> paths that use access_ok but don't do on to call uaccess_begin()? A
>>>> quick glance shows a few places where we are open coding the stac().
>>>> Perhaps land the lfence in stac() directly?
>>>
>>> Yeah, we should put it in uaccess_begin(), and in the actual user
>>> accessor helpers that do stac. Some of them probably should be changed
>>> to use uaccess_begin() instead while at it.
>>>
>>> One question for the CPU people: do we actually care and need to do
>>> this for things that might *write* to something? The speculative write
>>> obviously is killed, but does it perhaps bring in a cacheline even
>>> when killed?
>>
>> As far as I understand a write could trigger a request-for-ownership
>> read for the target cacheline.
>
> Oh, absolutely.
>
> I just wonder at what point that happens.
>
> Honestly, trying to get exclusive access to a cacheline can be _very_
> expensive (not just for the local thread), so I would actually expect
> that doing so for speculative writes is actually bad for performance.
>
> That's doubly true because - unlike reads - there is no critical
> latency issue, so trying to get the cache access started as early as
> possible simply isn't all that important.
>
> So I suspect that a write won't actually try to allocate the cacheline
> until the write has actually retired.
>
> End result: writes - unlike reads - *probably* will not speculatively
> perturb the cache with speculative write addresses.
>
>> Even though writes can trigger reads, as far as I can see the write
>> needs to be dependent on the first out-of-bounds read
>
> Yeah. A write on its own wouldn't matter, even if it were to perturb
> the cache state, because the address already comes from user space, so
> there's no new information in the cache perturbation for the attacker.
>
> But that all implies that we shouldn't need the lfence for the
> "put_user()" case, only for the get_user() (where the value we read
> would then perhaps be used to do another access).
>
> So we want to add the lfence (or "and") to get_user(), but not
> necessarily put_user().

Yes, perhaps __uaccess_begin_get() and __uaccess_begin_put() to keep
things separate?

> Agreed?

I've been thinking the "and" is only suitable for the array bounds
check, for get_user() we're trying to block speculation past
access_ok() at which point we can only do the lfence?


Re: [PATCH 06/18] x86, barrier: stop speculation for failed access_ok

2018-01-08 Thread Linus Torvalds
On Mon, Jan 8, 2018 at 1:09 PM, Dan Williams  wrote:
> On Sat, Jan 6, 2018 at 5:20 PM, Linus Torvalds
>  wrote:
>> On Sat, Jan 6, 2018 at 3:31 PM, Dan Williams  
>> wrote:
>>>
>>> I assume if we put this in uaccess_begin() we also need audit for
>>> paths that use access_ok but don't do on to call uaccess_begin()? A
>>> quick glance shows a few places where we are open coding the stac().
>>> Perhaps land the lfence in stac() directly?
>>
>> Yeah, we should put it in uaccess_begin(), and in the actual user
>> accessor helpers that do stac. Some of them probably should be changed
>> to use uaccess_begin() instead while at it.
>>
>> One question for the CPU people: do we actually care and need to do
>> this for things that might *write* to something? The speculative write
>> obviously is killed, but does it perhaps bring in a cacheline even
>> when killed?
>
> As far as I understand a write could trigger a request-for-ownership
> read for the target cacheline.

Oh, absolutely.

I just wonder at what point that happens.

Honestly, trying to get exclusive access to a cacheline can be _very_
expensive (not just for the local thread), so I would actually expect
that doing so for speculative writes is actually bad for performance.

That's doubly true because - unlike reads - there is no critical
latency issue, so trying to get the cache access started as early as
possible simply isn't all that important.

So I suspect that a write won't actually try to allocate the cacheline
until the write has actually retired.

End result: writes - unlike reads - *probably* will not speculatively
perturb the cache with speculative write addresses.

> Even though writes can trigger reads, as far as I can see the write
> needs to be dependent on the first out-of-bounds read

Yeah. A write on its own wouldn't matter, even if it were to perturb
the cache state, because the address already comes from user space, so
there's no new information in the cache perturbation for the attacker.

But that all implies that we shouldn't need the lfence for the
"put_user()" case, only for the get_user() (where the value we read
would then perhaps be used to do another access).

So we want to add the lfence (or "and") to get_user(), but not
necessarily put_user().

Agreed?

  Linus


Re: [PATCH bpf] selftests/bpf: fix test_align

2018-01-08 Thread Alexei Starovoitov

On 1/8/18 8:38 AM, Edward Cree wrote:

On 05/01/18 23:02, Alexei Starovoitov wrote:

since commit 82abbf8d2fc4 the verifier rejects the bit-wise
arithmetic on pointers earlier.
The test 'dubious pointer arithmetic' now has less output to match on.
Adjust it.

Fixes: 82abbf8d2fc4 ("bpf: do not allow root to mangle valid pointers")
Reported-by: kernel test robot 
Signed-off-by: Alexei Starovoitov 
---
 tools/testing/selftests/bpf/test_align.c | 22 +-
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_align.c 
b/tools/testing/selftests/bpf/test_align.c
index 8591c89c0828..471bbbdb94db 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -474,27 +474,7 @@ static struct bpf_align_test tests[] = {
.result = REJECT,
.matches = {
{4, "R5=pkt(id=0,off=0,r=0,imm=0)"},
-   /* ptr & 0x40 == either 0 or 0x40 */
-   {5, "R5=inv(id=0,umax_value=64,var_off=(0x0; 0x40))"},
-   /* ptr << 2 == unknown, (4n) */
-   {7, 
"R5=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0;
 0xfffffffffffffffc))"},
-   /* (4n) + 14 == (4n+2).  We blow our bounds, because
-* the add could overflow.
-*/
-   {8, "R5=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"},
-   /* Checked s>=0 */
-   {10, 
"R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 
0x7ffffffffffffffc))"},
-   /* packet pointer + nonnegative (4n+2) */
-   {12, 
"R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 
0x7ffffffffffffffc))"},
-   {14, 
"R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 
0x7ffffffffffffffc))"},
-   /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
-* We checked the bounds, but it might have been able
-* to overflow if the packet pointer started in the
-* upper half of the address space.
-* So we did not get a 'range' on R6, and the access
-* attempt will fail.
-*/
-   {16, 
"R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 
0x7ffffffffffffffc))"},
+   /* R5 bitwise operator &= on pointer prohibited */
}
},
{

Rather than neutering this test, we should change it to keep the part where
 it tests that a large pkt_ptr offset prevents us getting a reg->range.
Specifically, in this test we have
r2 = pkt
r5 = large unknown scalar
r6 = r2 + r5
r4 = r6 + 4
Then we check r4 < pkt_end, which normally would give r6->range = 4, but in
 this case must not do so since r6 could be (u64)(-2) in which case r4 = 2
 < pkt_end despite r6 not pointing into the packet.
AFAICT there is no other coverage of this case in test_align, and I don't
 recall such a test being in test_verifier either.  So please instead replace
 the insns that do prohibited ops on pointers with some other way of creating
 a large unknown scalar, and keep the rest of the test case intact.


makes sense. will send a follow up patch when security dust settles.



Re: [PATCH bpf] bpf: prevent out-of-bounds speculation

2018-01-08 Thread Alexei Starovoitov

On 1/8/18 9:05 AM, Mark Rutland wrote:

Hi Alexei,

On Thu, Jan 04, 2018 at 08:28:11PM -0800, Alexei Starovoitov wrote:

From: Alexei Starovoitov 

Under speculation, CPUs may mis-predict branches in bounds checks. Thus,
memory accesses under a bounds check may be speculated even if the
bounds check fails, providing a primitive for building a side channel.

To avoid leaking kernel data round up array-based maps and mask the index
after bounds check, so speculated load with out of bounds index will load
either valid value from the array or zero from the padded area.


Thanks for putting this together, this certainly looks neat.

I'm a little worried that in the presence of some CPU/compiler
optimisations, the masking may effectively be skipped under speculation.
So I'm not sure how robust this is going to be.

More on that below.


To avoid duplicating map_lookup functions for root/unpriv always generate
a sequence of bpf instructions equivalent to map_lookup function for
array and array_of_maps map types when map was created by unpriv user.
And unconditionally mask index for percpu_array, since it's fast enough,
even when max_entries are not rounded to power of 2 for root user,
since percpu_array doesn't have map_gen_lookup callback yet.


Is there a noticeable slowdown from the masking? Can't we always have
that in place?


right. Please see v3 version:
https://patchwork.ozlabs.org/patch/856645/
Daniel noticed that speculation can happen without program being
loaded and we need to tighten the path via syscall as well.
so v3 is doing masking for all array types unconditionally.
The perf cost is within noise for interpreter and
not seen with JITed root code, since gen_lookup does not
emit AND for root.


@@ -157,7 +175,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map 
*map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;

-   return this_cpu_ptr(array->pptrs[index]);
+   return this_cpu_ptr(array->pptrs[index & array->index_mask]);


As above, I think this isn't necessarily robust, as CPU/compiler
optimisations can break the dependency on the index_mask, allowing
speculation without a mask.

e.g. a compiler could re-write this as:

if (array->index_mask != 0xffffffff)
index &= array->index_mask;
return this_cpu_ptr(array->pptrs[index]);

... which would allow an unmasked index to be used in speculated paths.


prior to kernel I've been working on sun, gcc, llvm compilers
and I've never seen such optimization ever proposed for AND.
It makes no sense.

For heavy ALU like div/mod and calls compiler does indeed try
to predict the value. de-virtualization is an example optimization
for indirect calls. Intel compiler pioneered this approach back in 2000.

Compilers can also optimize "div by X" into
if (x == const)
  unroll div by const into something faster;
else
  div by X

Such optimizations are rarely done without profile feedback,
since branch is costly the compiler will add a branch only if there
is a clear win from introducing it instead of doing the operation.
For and, or, shift, add, sub there is never a case to do so.
Instead compiler is always trying to remove branches instead of
introducing them.


Similar cases could occur with some CPU implementations. For example, HW
value-prediction could result in the use of an all-ones mask under
speculation.


please see the paper that Alan mentioned.
HW value speculation predicts likely valid values. It makes no sense
for HW to continue speculative execution with random value.
Consider array[index & index_mask]
if load index_mask stalls and cpu decides to continue speculation with
random value (both zero and 0xffffffff are considered random) it will proceed
through AND and second load will populate the precious cache
with completely irrelevant data.
Such cpu will be slower with speculative execution than without,
since it populates the caches with random data.


I think that we may need to be able to provide an arch-specific
pointer sanitization sequence (though we could certainly have masking as
the default).


I still don't understand where this paranoia is coming from.
Kernel doesn't need to kill speculation. It needs to manage it.


I have a rough idea as to how that could be plumbed into the JIT. First
I need to verify the sequence I have in mind for arm/arm64 is
sufficient.


hmm? the patch provided (both v2 and v3) doesn't need any JIT changes
on either x64, arm, etc.
gen_lookup() emits BPF_AND on index that JIT converts into actual AND
in native instruction set.



[PATCH next-queue 2/2] ixgbe: add unlikely notes to tx fastpath expressions

2018-01-08 Thread Shannon Nelson
Add unlikely() to a few error checking expressions in the Tx
offload handling.

Suggested-by: Yanjun Zhu 
Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 57c10e6..3d069a2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -749,28 +749,28 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
struct xfrm_state *xs;
struct tx_sa *tsa;
 
-   if (!first->skb->sp->len) {
+   if (unlikely(!first->skb->sp->len)) {
netdev_err(tx_ring->netdev, "%s: no xfrm state len = %d\n",
   __func__, first->skb->sp->len);
return 0;
}
 
xs = xfrm_input_state(first->skb);
-   if (!xs) {
+   if (unlikely(!xs)) {
netdev_err(tx_ring->netdev, "%s: no xfrm_input_state() xs = 
%p\n",
   __func__, xs);
return 0;
}
 
itd->sa_idx = xs->xso.offload_handle - IXGBE_IPSEC_BASE_TX_INDEX;
-   if (itd->sa_idx > IXGBE_IPSEC_MAX_SA_COUNT) {
+   if (unlikely(itd->sa_idx > IXGBE_IPSEC_MAX_SA_COUNT)) {
netdev_err(tx_ring->netdev, "%s: bad sa_idx=%d handle=%lu\n",
   __func__, itd->sa_idx, xs->xso.offload_handle);
return 0;
}
 
tsa = &ipsec->tx_tbl[itd->sa_idx];
-   if (!tsa->used) {
+   if (unlikely(!tsa->used)) {
netdev_err(tx_ring->netdev, "%s: unused sa_idx=%d\n",
   __func__, itd->sa_idx);
return 0;
-- 
2.7.4



[PATCH next-queue 1/2] ixgbe: fix clean hw loop count

2018-01-08 Thread Shannon Nelson
Fix a cut-paste error so that we can clean all the table entries.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 12c7132..57c10e6 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -148,7 +148,7 @@ static void ixgbe_ipsec_clear_hw_tables(struct 
ixgbe_adapter *adapter)
ixgbe_ipsec_set_rx_sa(hw, idx, 0, buf, 0, 0, 0);
ixgbe_ipsec_set_rx_ip(hw, idx, (__be32 *)buf);
}
-   for (; idx < IXGBE_IPSEC_MAX_RX_IP_COUNT; idx++) {
+   for (; idx < IXGBE_IPSEC_MAX_SA_COUNT; idx++) {
ixgbe_ipsec_set_tx_sa(hw, idx, buf, 0);
ixgbe_ipsec_set_rx_sa(hw, idx, 0, buf, 0, 0, 0);
}
-- 
2.7.4



RE: [Intel-wired-lan] [PATCH 01/27] timecounter: Make cyclecounter struct part of timecounter struct

2018-01-08 Thread Brown, Aaron F
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On
> Behalf Of Sagar Arun Kamble
> Sent: Thursday, December 14, 2017 11:38 PM
> To: linux-ker...@vger.kernel.org
> Cc: alsa-de...@alsa-project.org; linux-r...@vger.kernel.org;
> netdev@vger.kernel.org; Richard Cochran ;
> Stephen Boyd ; Chris Wilson  wilson.co.uk>; John Stultz ; intel-wired-
> l...@lists.osuosl.org; Thomas Gleixner ; Kamble, Sagar A
> ; kvm...@lists.cs.columbia.edu; linux-arm-
> ker...@lists.infradead.org
> Subject: [Intel-wired-lan] [PATCH 01/27] timecounter: Make cyclecounter
> struct part of timecounter struct
> 
> There is no real need for the users of timecounters to define cyclecounter
> and timecounter variables separately. Since timecounter will always be
> based on cyclecounter, have cyclecounter struct as member of timecounter
> struct.
> 
> v2: Rebase.
> 
> Suggested-by: Chris Wilson 
> Signed-off-by: Sagar Arun Kamble 
> Cc: Chris Wilson 
> Cc: Richard Cochran 
> Cc: John Stultz 
> Cc: Thomas Gleixner 
> Cc: Stephen Boyd 
> Cc: linux-ker...@vger.kernel.org
> Cc: linux-arm-ker...@lists.infradead.org
> Cc: netdev@vger.kernel.org
> Cc: intel-wired-...@lists.osuosl.org
> Cc: linux-r...@vger.kernel.org
> Cc: alsa-de...@alsa-project.org
> Cc: kvm...@lists.cs.columbia.edu
> Acked-by: Jeff Kirsher  (Intel drivers)
> ---
>  arch/microblaze/kernel/timer.c | 20 ++--
>  drivers/clocksource/arm_arch_timer.c   | 19 ++--
>  drivers/net/ethernet/amd/xgbe/xgbe-dev.c   |  3 +-
>  drivers/net/ethernet/amd/xgbe/xgbe-ptp.c   |  9 +++---
>  drivers/net/ethernet/amd/xgbe/xgbe.h   |  1 -
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x.h|  1 -
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c   | 20 ++--
>  drivers/net/ethernet/freescale/fec.h   |  1 -
>  drivers/net/ethernet/freescale/fec_ptp.c   | 30 +-
>  drivers/net/ethernet/intel/e1000e/e1000.h  |  1 -
>  drivers/net/ethernet/intel/e1000e/netdev.c | 27 
>  drivers/net/ethernet/intel/e1000e/ptp.c|  2 +-
>  drivers/net/ethernet/intel/igb/igb.h   |  1 -
>  drivers/net/ethernet/intel/igb/igb_ptp.c   | 25 ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe.h   |  1 -
>  drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c   | 17 +-
>  drivers/net/ethernet/mellanox/mlx4/en_clock.c  | 28 -
>  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  1 -
>  .../net/ethernet/mellanox/mlx5/core/lib/clock.c| 34 ++--
>  drivers/net/ethernet/qlogic/qede/qede_ptp.c| 20 ++--
>  drivers/net/ethernet/ti/cpts.c | 36 
> --
>  drivers/net/ethernet/ti/cpts.h |  1 -
>  include/linux/mlx5/driver.h|  1 -
>  include/linux/timecounter.h|  4 +--
>  include/sound/hdaudio.h|  1 -
>  kernel/time/timecounter.c  | 28 -
>  sound/hda/hdac_stream.c|  7 +++--
>  virt/kvm/arm/arch_timer.c  |  6 ++--
>  28 files changed, 163 insertions(+), 182 deletions(-)
> 

For Intel e1000e and igb drivers:
Tested-by: Aaron Brown 


Re: dvb usb issues since kernel 4.9

2018-01-08 Thread Jesper Dangaard Brouer
On Mon, 8 Jan 2018 22:44:27 +0100
Peter Zijlstra  wrote:

> On Mon, Jan 08, 2018 at 10:31:09PM +0100, Jesper Dangaard Brouer wrote:
> > I did expect the issue to get worse, when you load the Pi with
> > network traffic, as now the softirq time-budget have to be shared
> > between networking and USB/DVB. Thus, I guess you are running TCP and
> > USB/mpeg2ts on the same CPU (why when you have 4 CPUs?...)  
> 
> Isn't networking also over USB on the Pi ?

Darn, that is true. Looking at the dmesg output in http://ix.io/DOg:

[0.405942] usbcore: registered new interface driver smsc95xx
[5.821104] smsc95xx 1-1.1:1.0 eth0: link up, 100Mbps, full-duplex, lpa 
0x45E1

I don't know enough about USB... is it possible to control which CPU
handles the individual USB ports, or on some other level (than ports)?

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: [PATCH bpf-next] bpf: introduce BPF_JIT_ALWAYS_ON config

2018-01-08 Thread Daniel Borkmann
On 01/08/2018 04:35 AM, Alexei Starovoitov wrote:
> The BPF interpreter has been used as part of the spectre 2 attack 
> CVE-2017-5715.
> 
> A quote from the Google Project Zero blog:
> "At this point, it would normally be necessary to locate gadgets in
> the host kernel code that can be used to actually leak data by reading
> from an attacker-controlled location, shifting and masking the result
> appropriately and then using the result of that as offset to an
> attacker-controlled address for a load. But piecing gadgets together
> and figuring out which ones work in a speculation context seems annoying.
> So instead, we decided to use the eBPF interpreter, which is built into
> the host kernel - while there is no legitimate way to invoke it from inside
> a VM, the presence of the code in the host kernel's text section is sufficient
> to make it usable for the attack, just like with ordinary ROP gadgets."
> 
> To make attacker job harder introduce BPF_JIT_ALWAYS_ON config
> option that removes interpreter from the kernel in favor of JIT-only mode.
> So far eBPF JIT is supported by:
> x64, arm64, arm32, sparc64, s390, powerpc64, mips64
> 
> The start of JITed program is randomized and code page is marked as read-only.
> In addition "constant blinding" can be turned on with net.core.bpf_jit_harden
> 
> Signed-off-by: Alexei Starovoitov 
> ---
>  init/Kconfig   | 7 +++
>  kernel/bpf/core.c  | 9 +
>  kernel/bpf/verifier.c  | 4 
>  net/core/sysctl_net_core.c | 9 +
>  4 files changed, 29 insertions(+)
> 
> diff --git a/init/Kconfig b/init/Kconfig
> index 2934249fba46..5e2a4a391ba9 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1392,6 +1392,13 @@ config BPF_SYSCALL
> Enable the bpf() system call that allows to manipulate eBPF
> programs and maps via file descriptors.
>  
> +config BPF_JIT_ALWAYS_ON
> + bool "Permanently enable BPF JIT and remove BPF interpreter"
> + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
> + help
> +   Enables BPF JIT and removes BPF interpreter to avoid
> +   speculative execution of BPF instructions by the interpreter
> +
>  config USERFAULTFD
>   bool "Enable userfaultfd() system call"
>   select ANON_INODES
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 70a534549cd3..42756c434e0b 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -781,6 +781,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 
> r4, u64 r5)
>  }
>  EXPORT_SYMBOL_GPL(__bpf_call_base);
>  
> +#ifndef CONFIG_BPF_JIT_ALWAYS_ON
>  /**
>   *   __bpf_prog_run - run eBPF program on a given context
>   *   @ctx: is the data we are operating on
> @@ -1376,6 +1377,7 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 
> stack_depth)
>   __bpf_call_base_args;
>   insn->code = BPF_JMP | BPF_CALL_ARGS;
>  }
> +#endif
>  
>  bool bpf_prog_array_compatible(struct bpf_array *array,
>  const struct bpf_prog *fp)
> @@ -1427,9 +1429,11 @@ static int bpf_check_tail_call(const struct bpf_prog 
> *fp)
>   */
>  struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
>  {
> +#ifndef CONFIG_BPF_JIT_ALWAYS_ON
>   u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
>  
>   fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
> +#endif
>  
>   /* eBPF JITs can rewrite the program in case constant
>* blinding is active. However, in case of error during
> @@ -1453,6 +1457,11 @@ struct bpf_prog *bpf_prog_select_runtime(struct 
> bpf_prog *fp, int *err)
>*/
>   *err = bpf_check_tail_call(fp);
>  
> +#ifdef CONFIG_BPF_JIT_ALWAYS_ON
> + if (!fp->jited)
> + *err = -ENOTSUPP;
> +#endif

This part here and ...

>   return fp;
>  }
>  EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
[...]
> @@ -524,6 +530,9 @@ static __net_initdata struct pernet_operations 
> sysctl_core_ops = {
>  
>  static __init int sysctl_core_init(void)
>  {
> +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_JIT_ALWAYS_ON)
> + bpf_jit_enable = 1;
> +#endif

... this one will race and break stuff in the current shape, one example
is the PTP classifier in the tree: sysctl_core_init() is done in fs_initcall(),
whereas ptp_classifier_init() is done in sock_init() which is done out of
core_initcall().

So what will happen is that at this point in time bpf_jit_enable is not yet
set to 1, so when ptp_classifier_init() calls the cBPF bpf_prog_create(), it
will migrate the insns over to eBPF and in bpf_prog_select_runtime() called
from bpf_migrate_filter() have the assumption that we always succeed here
since when JIT fails, we will fall back to the interpreter anyway. The only
error up until now in bpf_prog_select_runtime() that could happen is out of
native eBPF prog load, so bpf_migrate_filter() will thus return just fine
and on first call to PTP classifier from a network packet, we'll get NULL
pointer deref since the fp->b

Re: dvb usb issues since kernel 4.9

2018-01-08 Thread Peter Zijlstra
On Mon, Jan 08, 2018 at 10:31:09PM +0100, Jesper Dangaard Brouer wrote:
> I did expect the issue to get worse, when you load the Pi with
> network traffic, as now the softirq time-budget have to be shared
> between networking and USB/DVB. Thus, I guess you are running TCP and
> USB/mpeg2ts on the same CPU (why when you have 4 CPUs?...)

Isn't networking also over USB on the Pi ?


Re: dvb usb issues since kernel 4.9

2018-01-08 Thread Jesper Dangaard Brouer

On Mon, 8 Jan 2018 17:26:10 +0100
"Josef Griebichler"  wrote:

> I tried your mentioned patch but unfortunately no real improvement for me.
> dmesg http://ix.io/DOg
> tvheadend service log http://ix.io/DOi
>
> Errors during recording are still there.

Are you _also_ recording the stream on the Raspberry Pi?

It seems to me, that you are expecting too much from this small device.

> Errors increase if there is additional tcp load on raspberry.

I did expect the issue to get worse when you load the Pi with
network traffic, as now the softirq time-budget has to be shared
between networking and USB/DVB. Thus, I guess you are running TCP and
USB/mpeg2ts on the same CPU (why when you have 4 CPUs?...)

If you expect/want to get stable performance out of such a small box,
then you (or LibreELEC) need to tune the box for this usage.  And it
does not have to be that complicated.  First step is to move IRQ
handling for the NIC to another CPU than the USB port handling the
DVB signal (/proc/irq/*/smp_affinity_list).  And then pin the
userspace process (taskset) to another CPU than the one handling
USB-softirq.

> Unfortunately there's no usbmon or tshark on libreelec so I can't
> provide further logs.

Do you have perf or trace-cmd on the box?  Maybe we could come up with
some kernel functions to trace, to measure/show the latency spikes?

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


[PATCH] wireless: broadcom: radio_2056: delete duplicated macro definitions

2018-01-08 Thread Rasmus Villemoes
Ctrl-V was hit twice when these macros were inserted:

$ sed -n '9,527p' ./drivers/net/wireless/broadcom/b43/radio_2056.h | md5sum
4db53450c59d9939e903d4e4ba6bc9b1  -
$ sed -n '528,1046p' ./drivers/net/wireless/broadcom/b43/radio_2056.h | md5sum
4db53450c59d9939e903d4e4ba6bc9b1  -

Signed-off-by: Rasmus Villemoes 
---
 drivers/net/wireless/broadcom/b43/radio_2056.h | 519 -
 1 file changed, 519 deletions(-)

diff --git a/drivers/net/wireless/broadcom/b43/radio_2056.h 
b/drivers/net/wireless/broadcom/b43/radio_2056.h
index 59297fdce5e3..779b80ea072f 100644
--- a/drivers/net/wireless/broadcom/b43/radio_2056.h
+++ b/drivers/net/wireless/broadcom/b43/radio_2056.h
@@ -525,525 +525,6 @@
 #define B2056_VCM_MASK 0x1C
 #define B2056_RSSI_VCM_SHIFT   0x02
 
-#define B2056_SYN  (0x0 << 12)
-#define B2056_TX0  (0x2 << 12)
-#define B2056_TX1  (0x3 << 12)
-#define B2056_RX0  (0x6 << 12)
-#define B2056_RX1  (0x7 << 12)
-#define B2056_ALLTX(0xE << 12)
-#define B2056_ALLRX(0xF << 12)
-
-#define B2056_SYN_RESERVED_ADDR0   0x00
-#define B2056_SYN_IDCODE   0x01
-#define B2056_SYN_RESERVED_ADDR2   0x02
-#define B2056_SYN_RESERVED_ADDR3   0x03
-#define B2056_SYN_RESERVED_ADDR4   0x04
-#define B2056_SYN_RESERVED_ADDR5   0x05
-#define B2056_SYN_RESERVED_ADDR6   0x06
-#define B2056_SYN_RESERVED_ADDR7   0x07
-#define B2056_SYN_COM_CTRL 0x08
-#define B2056_SYN_COM_PU   0x09
-#define B2056_SYN_COM_OVR  0x0A
-#define B2056_SYN_COM_RESET0x0B
-#define B2056_SYN_COM_RCAL 0x0C
-#define B2056_SYN_COM_RC_RXLPF 0x0D
-#define B2056_SYN_COM_RC_TXLPF 0x0E
-#define B2056_SYN_COM_RC_RXHPF 0x0F
-#define B2056_SYN_RESERVED_ADDR16  0x10
-#define B2056_SYN_RESERVED_ADDR17  0x11
-#define B2056_SYN_RESERVED_ADDR18  0x12
-#define B2056_SYN_RESERVED_ADDR19  0x13
-#define B2056_SYN_RESERVED_ADDR20  0x14
-#define B2056_SYN_RESERVED_ADDR21  0x15
-#define B2056_SYN_RESERVED_ADDR22  0x16
-#define B2056_SYN_RESERVED_ADDR23  0x17
-#define B2056_SYN_RESERVED_ADDR24  0x18
-#define B2056_SYN_RESERVED_ADDR25  0x19
-#define B2056_SYN_RESERVED_ADDR26  0x1A
-#define B2056_SYN_RESERVED_ADDR27  0x1B
-#define B2056_SYN_RESERVED_ADDR28  0x1C
-#define B2056_SYN_RESERVED_ADDR29  0x1D
-#define B2056_SYN_RESERVED_ADDR30  0x1E
-#define B2056_SYN_RESERVED_ADDR31  0x1F
-#define B2056_SYN_GPIO_MASTER1 0x20
-#define B2056_SYN_GPIO_MASTER2 0x21
-#define B2056_SYN_TOPBIAS_MASTER   0x22
-#define B2056_SYN_TOPBIAS_RCAL 0x23
-#define B2056_SYN_AFEREG   0x24
-#define B2056_SYN_TEMPPROCSENSE0x25
-#define B2056_SYN_TEMPPROCSENSEIDAC0x26
-#define B2056_SYN_TEMPPROCSENSERCAL0x27
-#define B2056_SYN_LPO  0x28
-#define B2056_SYN_VDDCAL_MASTER0x29
-#define B2056_SYN_VDDCAL_IDAC  0x2A
-#define B2056_SYN_VDDCAL_STATUS0x2B
-#define B2056_SYN_RCAL_MASTER  0x2C
-#define B2056_SYN_RCAL_CODE_OUT0x2D
-#define B2056_SYN_RCCAL_CTRL0  0x2E
-#define B2056_SYN_RCCAL_CTRL1  0x2F
-#define B2056_SYN_RCCAL_CTRL2  0x30
-#define B2056_SYN_RCCAL_CTRL3  0x31
-#define B2056_SYN_RCCAL_CTRL4  0x32
-#define B2056_SYN_RCCAL_CTRL5  0x33
-#define B2056_SYN_RCCAL_CTRL6  0x34
-#define B2056_SYN_RCCAL_CTRL7  0x35
-#define B2056_SYN_RCCAL_CTRL8  0x36
-#define B2056_SYN_RCCAL_CTRL9  0x37
-#define B2056_SYN_RCCAL_CTRL10 0x38
-#define B2056_SYN_RCCAL_CTRL11 0x39
-#define B2056_SYN_ZCAL_SPARE1  0x3A
-#define B2056_SYN_ZCAL_SPARE2  0x3B
-#define B2056_SYN_PLL_MAST10x3C
-#define B2056_SYN_PLL_MAST20x3D
-#define B2056_SYN_PLL_MAST30x3E
-#define B2056_SYN_PLL_BIAS_RESET   0x3F
-#define B2056_SYN_PLL_XTAL00x40
-#define B2056_SYN_PLL_XTAL10x41
-#define B2056_SYN_PLL_XTAL30x42
-#define B2056_SYN_PLL_XTAL40x43
-#define B2056_SYN_PLL_XTAL50x44
-#define B2056_SYN_PLL_XTAL60x45
-#define B2056_SYN_PLL_REFDIV   0x46
-#define B2056_SYN_PLL_PFD  0x47
-#define B2056_SYN_PLL_CP1  0x48
-#define B2056_SYN_PLL_CP2  0x49
-#define B2056_SYN_PLL_CP3  0x4A
-#define B2056_SYN_PLL_LOOPFILTER1  0x4B
-#define B2056_SYN_PLL_LOOPFILTER2  0x4C
-#define B2056_SYN_PLL_LOOPFILTER3  0x4D
-#define B2056_SYN_PLL_LOOPFILTER4  0x4E
-#define B2056_SYN_PLL_LOOPFILTER5  0x4F
-#define B2056_SYN_PLL_MMD1 0x50
-#define B2056_SYN_PLL_MMD2 0x51
-#define B2056_SYN_PLL_VCO1 0x52
-#define B2056_SYN_PLL_VCO2 0x53
-#define B2056_S

Re: [PATCH 02/18] Documentation: document nospec helpers

2018-01-08 Thread Jonathan Corbet
On Mon, 8 Jan 2018 17:09:59 +
Mark Rutland  wrote:

> > I have just a couple of overall comments.
> > 
> >  - It would be nice if the document were done in RST and placed in the
> >core-API manual, perhaps using kerneldoc comments for the macros
> >themselves.  It's already 99.9% RST now, so the changes required would
> >be minimal.  
> 
> Is there any quickstart guide to RST that you can recommend?

http://docutils.sourceforge.net/docs/user/rst/quickref.html works
reasonably well.  We have some info in the kernel documentation as well,
see http://static.lwn.net/kerneldoc/doc-guide/sphinx.html

Thanks,

jon


Re: [PATCH 06/18] x86, barrier: stop speculation for failed access_ok

2018-01-08 Thread Dan Williams
On Sat, Jan 6, 2018 at 5:20 PM, Linus Torvalds
 wrote:
> On Sat, Jan 6, 2018 at 3:31 PM, Dan Williams  wrote:
>>
>> I assume if we put this in uaccess_begin() we also need audit for
>> paths that use access_ok but don't go on to call uaccess_begin()? A
>> quick glance shows a few places where we are open coding the stac().
>> Perhaps land the lfence in stac() directly?
>
> Yeah, we should put it in uaccess_begin(), and in the actual user
> accessor helpers that do stac. Some of them probably should be changed
> to use uaccess_begin() instead while at it.
>
> One question for the CPU people: do we actually care and need to do
> this for things that might *write* to something? The speculative write
> obviously is killed, but does it perhaps bring in a cacheline even
> when killed?

As far as I understand a write could trigger a request-for-ownership
read for the target cacheline.

> Because maybe we don't need the lfence in put_user(), only in get_user()?

Even though writes can trigger reads, as far as I can see the write
needs to be dependent on the first out-of-bounds read:

if (x < max)
y = array1[x];
put_user(array2 + y, z);

...in other words that first read should be annotated with
nospec_array_ptr() making an lfence in put_user() or other writes
moot.

yp = nospec_array_ptr(array1, x, max);
if (yp)
y = *yp;
put_user(array2 + y, z);


[PATCH net 2/3] sctp: add a ceiling to optlen in some sockopts

2018-01-08 Thread Marcelo Ricardo Leitner
Hangbin Liu reported that some sockopt calls could cause the kernel to log
a warning on memory allocation failure if the user supplied a large optlen
value. That is because some of them called memdup_user() without a ceiling
on optlen, allowing it to try to allocate really large buffers.

This patch adds a ceiling by limiting optlen to the maximum allowed that
would still make sense for these sockopt.

Reported-by: Hangbin Liu 
Signed-off-by: Marcelo Ricardo Leitner 
---
 net/sctp/socket.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 
54c046783a89e76c9909ee85c83e6be38ada41a7..022b94f11fd8ac0d3b839b16dfc14f86abf2324f
 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3498,6 +3498,8 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk,
 
if (optlen < sizeof(struct sctp_hmacalgo))
return -EINVAL;
+   optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) +
+SCTP_AUTH_NUM_HMACS * sizeof(u16));
 
hmacs = memdup_user(optval, optlen);
if (IS_ERR(hmacs))
@@ -3536,6 +3538,11 @@ static int sctp_setsockopt_auth_key(struct sock *sk,
 
if (optlen <= sizeof(struct sctp_authkey))
return -EINVAL;
+   /* authkey->sca_keylength is u16, so optlen can't be bigger than
+* this.
+*/
+   optlen = min_t(unsigned int, optlen, USHRT_MAX +
+sizeof(struct sctp_authkey));
 
authkey = memdup_user(optval, optlen);
if (IS_ERR(authkey))
@@ -3893,6 +3900,9 @@ static int sctp_setsockopt_reset_streams(struct sock *sk,
 
if (optlen < sizeof(*params))
return -EINVAL;
+   /* srs_number_streams is u16, so optlen can't be bigger than this. */
+   optlen = min_t(unsigned int, optlen, USHRT_MAX +
+sizeof(__u16) * sizeof(*params));
 
params = memdup_user(optval, optlen);
if (IS_ERR(params))
-- 
2.14.3



  1   2   3   4   >