date:20171109

[PATCH net-next] cxgb4: collect vpd info directly from hardware

2017-11-09 Thread Rahul Lakkireddy

Collect vpd information directly from hardware instead of software
adapter context. Move EEPROM physical address to virtual address
translation logic to t4_hw.c and update relevant files.

Fixes: 6f92a6544f1a ("cxgb4: collect hardware misc dumps")
Signed-off-by: Rahul Lakkireddy 
Signed-off-by: Ganesh Goudar 
---
 drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h  |  6 ++
 drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 77 ++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |  1 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 33 +-
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 29 
 5 files changed, 104 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h 
b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
index 239c43084e77..1de1d811fde3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
@@ -166,6 +166,12 @@ struct cudbg_mps_tcam {
u8 reserved[2];
 };
 
+#define CUDBG_VPD_PF_SIZE 0x800
+#define CUDBG_SCFG_VER_ADDR 0x06
+#define CUDBG_SCFG_VER_LEN 4
+#define CUDBG_VPD_VER_ADDR 0x18c7
+#define CUDBG_VPD_VER_LEN 2
+
 struct cudbg_vpd_data {
u8 sn[SERNUM_LEN + 1];
u8 bn[PN_LEN + 1];
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c 
b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
index fe3a9ef0ec3f..32c9858da110 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
@@ -68,6 +68,22 @@ struct cudbg_entity_hdr *cudbg_get_entity_hdr(void *outbuf, 
int i)
(sizeof(struct cudbg_entity_hdr) * (i - 1)));
 }
 
+static int cudbg_read_vpd_reg(struct adapter *padap, u32 addr, u32 len,
+ void *dest)
+{
+   int vaddr, rc;
+
+   vaddr = t4_eeprom_ptov(addr, padap->pf, EEPROMPFSIZE);
+   if (vaddr < 0)
+   return vaddr;
+
+   rc = pci_read_vpd(padap->pdev, vaddr, len, dest);
+   if (rc < 0)
+   return rc;
+
+   return 0;
+}
+
 int cudbg_collect_reg_dump(struct cudbg_init *pdbg_init,
   struct cudbg_buffer *dbg_buff,
   struct cudbg_error *cudbg_err)
@@ -1289,8 +1305,47 @@ int cudbg_collect_vpd_data(struct cudbg_init *pdbg_init,
 {
struct adapter *padap = pdbg_init->adap;
struct cudbg_buffer temp_buff = { 0 };
+   char vpd_str[CUDBG_VPD_VER_LEN + 1];
+   u32 scfg_vers, vpd_vers, fw_vers;
struct cudbg_vpd_data *vpd_data;
-   int rc;
+   struct vpd_params vpd = { 0 };
+   int rc, ret;
+
+   rc = t4_get_raw_vpd_params(padap, &vpd);
+   if (rc)
+   return rc;
+
+   rc = t4_get_fw_version(padap, &fw_vers);
+   if (rc)
+   return rc;
+
+   /* Serial Configuration Version is located beyond the PF's vpd size.
+* Temporarily give access to entire EEPROM to get it.
+*/
+   rc = pci_set_vpd_size(padap->pdev, EEPROMVSIZE);
+   if (rc < 0)
+   return rc;
+
+   ret = cudbg_read_vpd_reg(padap, CUDBG_SCFG_VER_ADDR, CUDBG_SCFG_VER_LEN,
+&scfg_vers);
+
+   /* Restore back to original PF's vpd size */
+   rc = pci_set_vpd_size(padap->pdev, CUDBG_VPD_PF_SIZE);
+   if (rc < 0)
+   return rc;
+
+   if (ret)
+   return ret;
+
+   rc = cudbg_read_vpd_reg(padap, CUDBG_VPD_VER_ADDR, CUDBG_VPD_VER_LEN,
+   vpd_str);
+   if (rc)
+   return rc;
+
+   vpd_str[CUDBG_VPD_VER_LEN] = '\0';
+   rc = kstrtouint(vpd_str, 0, &vpd_vers);
+   if (rc)
+   return rc;
 
rc = cudbg_get_buff(dbg_buff, sizeof(struct cudbg_vpd_data),
&temp_buff);
@@ -1298,16 +1353,16 @@ int cudbg_collect_vpd_data(struct cudbg_init *pdbg_init,
return rc;
 
vpd_data = (struct cudbg_vpd_data *)temp_buff.data;
-   memcpy(vpd_data->sn, padap->params.vpd.sn, SERNUM_LEN + 1);
-   memcpy(vpd_data->bn, padap->params.vpd.pn, PN_LEN + 1);
-   memcpy(vpd_data->na, padap->params.vpd.na, MACADDR_LEN + 1);
-   memcpy(vpd_data->mn, padap->params.vpd.id, ID_LEN + 1);
-   vpd_data->scfg_vers = padap->params.scfg_vers;
-   vpd_data->vpd_vers = padap->params.vpd_vers;
-   vpd_data->fw_major = FW_HDR_FW_VER_MAJOR_G(padap->params.fw_vers);
-   vpd_data->fw_minor = FW_HDR_FW_VER_MINOR_G(padap->params.fw_vers);
-   vpd_data->fw_micro = FW_HDR_FW_VER_MICRO_G(padap->params.fw_vers);
-   vpd_data->fw_build = FW_HDR_FW_VER_BUILD_G(padap->params.fw_vers);
+   memcpy(vpd_data->sn, vpd.sn, SERNUM_LEN + 1);
+   memcpy(vpd_data->bn, vpd.pn, PN_LEN + 1);
+   memcpy(vpd_data->na, vpd.na, MACADDR_LEN + 1);
+   memcpy(vpd_data->mn, vpd.id, ID_LEN + 1);
+   vpd_data->scfg_vers = scfg_vers;
+

Re: [PATCH iproute2 2/2] devlink: add batch command support

2017-11-09 Thread Ivan Vecera

On 10.11.2017 07:57, Leon Romanovsky wrote:
> On Fri, Nov 10, 2017 at 07:20:14AM +0100, Ivan Vecera wrote:
>> The patch adds support to batch devlink commands.
>>
>> Cc: Jiri Pirko 
>> Cc: Arkadi Sharshevsky 
>> Signed-off-by: Ivan Vecera 
>> ---
>>  devlink/devlink.c  | 70 
>> +++---
>>  man/man8/devlink.8 | 16 +
>>  2 files changed, 78 insertions(+), 8 deletions(-)
>>
> 
> <..>
> 
>> diff --git a/man/man8/devlink.8 b/man/man8/devlink.8
>> index a480766c..a975ef34 100644
>> --- a/man/man8/devlink.8
>> +++ b/man/man8/devlink.8
>> @@ -12,6 +12,12 @@ devlink \- Devlink tool
>>  .sp
>>
>>  .ti -8
>> +.B devlink
>> +.RB "[ " -force " ] "
>> +.BI "-batch " filename
>> +.sp
>> +
>> +.ti -8
>>  .IR OBJECT " := { "
>>  .BR dev " | " port " | " monitor " }"
>>  .sp
>> @@ -32,6 +38,16 @@ Print the version of the
>>  utility and exit.
>>
>>  .TP
>> +.BR "\-b", " \-batch " 
>> +Read commands from provided file or standard input and invoke them.
>> +First failure will cause termination of devlink.
> 
> It is worth documenting the expected format of that file.
> And IMHO, it is better to have the ability to load a JSON file which was
> generated by -j, instead of declaring a new format/knob.
It's just a list of command-lines... like other utils (bridge,ip...)

I.



signature.asc
Description: OpenPGP digital signature

Re: [PATCH iproute2 2/2] devlink: add batch command support

2017-11-09 Thread Leon Romanovsky

On Fri, Nov 10, 2017 at 07:20:14AM +0100, Ivan Vecera wrote:
> The patch adds support to batch devlink commands.
>
> Cc: Jiri Pirko 
> Cc: Arkadi Sharshevsky 
> Signed-off-by: Ivan Vecera 
> ---
>  devlink/devlink.c  | 70 
> +++---
>  man/man8/devlink.8 | 16 +
>  2 files changed, 78 insertions(+), 8 deletions(-)
>

<..>

> diff --git a/man/man8/devlink.8 b/man/man8/devlink.8
> index a480766c..a975ef34 100644
> --- a/man/man8/devlink.8
> +++ b/man/man8/devlink.8
> @@ -12,6 +12,12 @@ devlink \- Devlink tool
>  .sp
>
>  .ti -8
> +.B devlink
> +.RB "[ " -force " ] "
> +.BI "-batch " filename
> +.sp
> +
> +.ti -8
>  .IR OBJECT " := { "
>  .BR dev " | " port " | " monitor " }"
>  .sp
> @@ -32,6 +38,16 @@ Print the version of the
>  utility and exit.
>
>  .TP
> +.BR "\-b", " \-batch " 
> +Read commands from provided file or standard input and invoke them.
> +First failure will cause termination of devlink.

It is worth documenting the expected format of that file.
And IMHO, it is better to have the ability to load a JSON file which was
generated by -j, instead of declaring a new format/knob.

> +
> +.TP
> +.BR "\-force"
> +Don't terminate devlink on errors in batch mode.
> +If there were any errors during execution of the commands, the application 
> return code will be non zero.
> +
> +.TP
>  .BR "\-n" , " --no-nice-names"
>  Turn off printing out nice names, for example netdevice ifnames instead of 
> devlink port identification.
>
> --
> 2.13.6
>


signature.asc
Description: PGP signature

Re: [net 3/6] net/mlx5: FPGA, return -EINVAL if size is zero

2017-11-09 Thread Saeed Mahameed

On Fri, 2017-11-10 at 15:23 +0900, Or Gerlitz wrote:
> On Fri, Nov 10, 2017 at 3:13 PM, Saeed Mahameed 
> wrote:
> > On Thu, 2017-11-09 at 18:12 +0900, Or Gerlitz wrote:
> > > On Thu, Nov 9, 2017 at 4:43 PM, Kamal Heib 
> > > wrote:
> > > > On Wed, 2017-11-08 at 23:13 +0900, Or Gerlitz wrote:
> > > > > > On Wed, Nov 8, 2017 at 4:21 PM, Saeed Mahameed
> > > > > > wrote:
> > > > > > From: Kamal Heib 
> > > > > > 
> > > > > > In the current code, if a size of zero is passed to
> > > > > > mlx5_fpga_mem_{read|write}_i2c() functions the "err"
> > > > > 
> > > > > Don't we need to fix the call site where zero size is
> > > > > provided
> > > > > and
> > > > > not
> > > > > in called function?
> > > > > 
> > > > 
> > > > Isn't sending down a zero size a sign for a bug which we are
> > > > not
> > > > fixing?
> > > > > 
> > > > 
> > > > Both functions are called from an exported symbols. so I think
> > > > the
> > > > size
> > > > validation should be within this two functions just like the
> > > > case
> > > > of
> > > > checking that mdev isn't set.
> > > 
> > > mmm, I see exported to who exactly? how are they being called, by
> > > func pointer?
> > > can you point to the call sites?
> > 
> > Or, are you ok with this patch ? I would like to post V2 with the
> > reviewed-by tag fix.
> 
> The RB tag issue was on another patch.. for this patch I realized
> after talking
> to the author that it comes to fix a build warning. I would be happy
> if we can clarify
> that in the change log.


Ok I will drop this patch until the author provides the missing
information.

Re: [net 3/6] net/mlx5: FPGA, return -EINVAL if size is zero

2017-11-09 Thread Or Gerlitz

On Fri, Nov 10, 2017 at 3:13 PM, Saeed Mahameed  wrote:
> On Thu, 2017-11-09 at 18:12 +0900, Or Gerlitz wrote:
>> On Thu, Nov 9, 2017 at 4:43 PM, Kamal Heib 
>> wrote:
>> > On Wed, 2017-11-08 at 23:13 +0900, Or Gerlitz wrote:
>> > > On Wed, Nov 8, 2017 at 4:21 PM, Saeed Mahameed
>> > > wrote:
>> > > > From: Kamal Heib 
>> > > >
>> > > > In the current code, if a size of zero is passed to
>> > > > mlx5_fpga_mem_{read|write}_i2c() functions the "err"
>> > >
>> > > Don't we need to fix the call site where zero size is provided
>> > > and
>> > > not
>> > > in called function?
>> > >
>> >
>> > Isn't sending down a zero size a sign for a bug which we are not
>> > fixing?
>> > >
>> >
>> > Both functions are called from an exported symbols. so I think the
>> > size
>> > validation should be within this two functions just like the case
>> > of
>> > checking that mdev isn't set.
>>
>> mmm, I see exported to who exactly? how are they being called, by
>> func pointer?
>> can you point to the call sites?
>
> Or, are you ok with this patch ? I would like to post V2 with the
> reviewed-by tag fix.

The RB tag issue was on another patch.. for this patch I realized after talking
to the author that it comes to fix a build warning. I would be happy
if we can clarify
that in the change log.

Re: [PATCH 30/31] dt-bindings: nds32 CPU Bindings

2017-11-09 Thread Greentime Hu

2017-11-09 21:57 GMT+08:00 Rob Herring :
> On Thu, Nov 9, 2017 at 3:39 AM, Greentime Hu  wrote:
>> 2017-11-08 21:18 GMT+08:00 Rob Herring :
>>> Please Cc the DT list on bindings.
>>
>> Sorry. I am not sure what you mean.
>> Do you mean add devicet...@vger.kernel.org to cc list?
>
> Yes. Use get_maintainers.pl as a guide.

Roger that! Thanks!

>>> On Tue, Nov 7, 2017 at 11:55 PM, Greentime Hu  wrote:
>>>> From: Greentime Hu
>>>
>
>>>> +   device_type = "cpu";
>>>> +   compatible = "andestech,n13", "andestech,n15";
>>>
>>> n13 is a superset of n15?
>>
>> No, they are independent ones.
>
> Then having both is not valid. The strings should be in order of best
> match to worst match where worst match is typically either older
> implementations of IP blocks or generic'ish strings such as "ns16550"
> for a UART.

Thanks.
I would like to explain it more clearly.
They are independent ones in implementations.
They are implemented based on the same nds32 ISA and architecture spec
with different configurations
like cache size, page size, cache type(VIPT/PIPT), pipeline stages...
Most of them are compatible.
They use the same toolchain to build vmlinux which can run on
different nds32 cores.

[PATCH iproute2 0/2] add batch command support to devlink

2017-11-09 Thread Ivan Vecera

This patch series adds support for batching devlink commands. The first
patch just removes the requirement to declare a 'resolve_hosts' variable in
any command that uses any function implemented in utils.c (it is really
confusing to see this declaration in utilities like bridge or devlink).

Ivan Vecera (2):
  lib: make resolve_hosts variable common
  devlink: add batch command support

 bridge/bridge.c|  1 -
 devlink/devlink.c  | 70 +++---
 genl/genl.c|  1 -
 ip/ip.c|  1 -
 ip/rtmon.c |  1 -
 lib/utils.c|  1 +
 man/man8/devlink.8 | 16 +
 misc/arpd.c|  2 --
 misc/ss.c  |  1 -
 tc/tc.c|  1 -
 10 files changed, 79 insertions(+), 16 deletions(-)

-- 
2.13.6

[PATCH iproute2 1/2] lib: make resolve_hosts variable common

2017-11-09 Thread Ivan Vecera

Any iproute utility that uses any function from lib/utils.c needs
to declare its own resolve_hosts variable instance although it does
not need/use hostname resolving functionality (currently only the 'ip'
and 'ss' commands use this).
The patch declares single common instance of resolve_hosts directly
in utils.c so the existing ones can be removed (the same approach
that is used for timestamp_short).

Cc: Jiri Pirko 
Cc: Arkadi Sharshevsky 
Signed-off-by: Ivan Vecera 
---
 bridge/bridge.c | 1 -
 genl/genl.c | 1 -
 ip/ip.c | 1 -
 ip/rtmon.c  | 1 -
 lib/utils.c | 1 +
 misc/arpd.c | 2 --
 misc/ss.c   | 1 -
 tc/tc.c | 1 -
 8 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/bridge/bridge.c b/bridge/bridge.c
index 5ff038d6..6658cb8f 100644
--- a/bridge/bridge.c
+++ b/bridge/bridge.c
@@ -18,7 +18,6 @@
 
 struct rtnl_handle rth = { .fd = -1 };
 int preferred_family = AF_UNSPEC;
-int resolve_hosts;
 int oneline;
 int show_stats;
 int show_details;
diff --git a/genl/genl.c b/genl/genl.c
index 747074b0..7e4a208d 100644
--- a/genl/genl.c
+++ b/genl/genl.c
@@ -30,7 +30,6 @@
 int show_stats = 0;
 int show_details = 0;
 int show_raw = 0;
-int resolve_hosts = 0;
 
 static void *BODY;
 static struct genl_util * genl_list;
diff --git a/ip/ip.c b/ip/ip.c
index e66f6970..e2da46dd 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -30,7 +30,6 @@ int human_readable;
 int use_iec;
 int show_stats;
 int show_details;
-int resolve_hosts;
 int oneline;
 int brief;
 int json;
diff --git a/ip/rtmon.c b/ip/rtmon.c
index 1c2981f7..94baa38e 100644
--- a/ip/rtmon.c
+++ b/ip/rtmon.c
@@ -25,7 +25,6 @@
 #include "utils.h"
 #include "libnetlink.h"
 
-int resolve_hosts;
 static int init_phase = 1;
 
 static void write_stamp(FILE *fp)
diff --git a/lib/utils.c b/lib/utils.c
index ac155bf5..f77be1fd 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -37,6 +37,7 @@
 #include "utils.h"
 #include "namespace.h"
 
+int resolve_hosts;
 int timestamp_short;
 
 int get_hex(char c)
diff --git a/misc/arpd.c b/misc/arpd.c
index c2666f76..67d86b67 100644
--- a/misc/arpd.c
+++ b/misc/arpd.c
@@ -38,8 +38,6 @@
 #include "utils.h"
 #include "rt_names.h"
 
-int resolve_hosts;
-
 DB *dbase;
 char   *dbname = "/var/lib/arpd/arpd.db";
 
diff --git a/misc/ss.c b/misc/ss.c
index 56a9ad41..45a0c330 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -88,7 +88,6 @@ static int security_get_initial_context(char *name,  char 
**context)
 }
 #endif
 
-int resolve_hosts;
 int resolve_services = 1;
 int preferred_family = AF_UNSPEC;
 int show_options;
diff --git a/tc/tc.c b/tc/tc.c
index 8e64a82b..32924164 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -39,7 +39,6 @@ int show_graph;
 int timestamp;
 
 int batch_mode;
-int resolve_hosts;
 int use_iec;
 int force;
 bool use_names;
-- 
2.13.6

[PATCH iproute2 2/2] devlink: add batch command support

2017-11-09 Thread Ivan Vecera

The patch adds support to batch devlink commands.

Cc: Jiri Pirko 
Cc: Arkadi Sharshevsky 
Signed-off-by: Ivan Vecera 
---
 devlink/devlink.c  | 70 +++---
 man/man8/devlink.8 | 16 +
 2 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 39cda067..1b15eef8 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -3803,12 +3803,16 @@ static int cmd_dpipe(struct dl *dl)
 static void help(void)
 {
pr_err("Usage: devlink [ OPTIONS ] OBJECT { COMMAND | help }\n"
+  "   devlink [ -f[orce] ] -b[atch] filename\n"
   "where  OBJECT := { dev | port | sb | monitor | dpipe }\n"
   "   OPTIONS := { -V[ersion] | -n[no-nice-names] | -j[json] | 
-p[pretty] | -v[verbose] }\n");
 }
 
-static int dl_cmd(struct dl *dl)
+static int dl_cmd(struct dl *dl, int argc, char **argv)
 {
+   dl->argc = argc;
+   dl->argv = argv;
+
if (dl_argv_match(dl, "help") || dl_no_arg(dl)) {
help();
return 0;
@@ -3832,13 +3836,10 @@ static int dl_cmd(struct dl *dl)
return -ENOENT;
 }
 
-static int dl_init(struct dl *dl, int argc, char **argv)
+static int dl_init(struct dl *dl)
 {
int err;
 
-   dl->argc = argc;
-   dl->argv = argv;
-
dl->nlg = mnlg_socket_open(DEVLINK_GENL_NAME, DEVLINK_GENL_VERSION);
if (!dl->nlg) {
pr_err("Failed to connect to devlink Netlink\n");
@@ -3890,16 +3891,59 @@ static void dl_free(struct dl *dl)
free(dl);
 }
 
+static int dl_batch(struct dl *dl, const char *name, bool force)
+{
+   char *line = NULL;
+   size_t len = 0;
+   int ret = EXIT_SUCCESS;
+
+   if (name && strcmp(name, "-") != 0) {
+   if (freopen(name, "r", stdin) == NULL) {
+   fprintf(stderr,
+   "Cannot open file \"%s\" for reading: %s\n",
+   name, strerror(errno));
+   return EXIT_FAILURE;
+   }
+   }
+
+   cmdlineno = 0;
+   while (getcmdline(&line, &len, stdin) != -1) {
+   char *largv[100];
+   int largc;
+
+   largc = makeargs(line, largv, 100);
+   if (!largc)
+   continue;   /* blank line */
+
+   if (dl_cmd(dl, largc, largv)) {
+   fprintf(stderr, "Command failed %s:%d\n",
+   name, cmdlineno);
+   ret = EXIT_FAILURE;
+   if (!force)
+   break;
+   }
+   }
+
+   if (line)
+   free(line);
+
+   return ret;
+}
+
 int main(int argc, char **argv)
 {
static const struct option long_options[] = {
{ "Version",no_argument,NULL, 'V' },
+   { "force",  no_argument,NULL, 'f' },
+   { "batch",  required_argument,  NULL, 'b' },
{ "no-nice-names",  no_argument,NULL, 'n' },
{ "json",   no_argument,NULL, 'j' },
{ "pretty", no_argument,NULL, 'p' },
{ "verbose",no_argument,NULL, 'v' },
{ NULL, 0, NULL, 0 }
};
+   const char *batch_file = NULL;
+   bool force = false;
struct dl *dl;
int opt;
int err;
@@ -3911,7 +3955,7 @@ int main(int argc, char **argv)
return EXIT_FAILURE;
}
 
-   while ((opt = getopt_long(argc, argv, "Vnjpv",
+   while ((opt = getopt_long(argc, argv, "Vfb:njpv",
  long_options, NULL)) >= 0) {
 
switch (opt) {
@@ -3919,6 +3963,12 @@ int main(int argc, char **argv)
printf("devlink utility, iproute2-ss%s\n", SNAPSHOT);
ret = EXIT_SUCCESS;
goto dl_free;
+   case 'f':
+   force = true;
+   break;
+   case 'b':
+   batch_file = optarg;
+   break;
case 'n':
dl->no_nice_names = true;
break;
@@ -3942,13 +3992,17 @@ int main(int argc, char **argv)
argc -= optind;
argv += optind;
 
-   err = dl_init(dl, argc, argv);
+   err = dl_init(dl);
if (err) {
ret = EXIT_FAILURE;
goto dl_free;
}
 
-   err = dl_cmd(dl);
+   if (batch_file)
+   err = dl_batch(dl, batch_file, force);
+   else
+   err = dl_cmd(dl, argc, argv);
+
if (err) {
ret = EXIT_FAILURE;
goto dl_fini;
diff --git

Re: [net 3/6] net/mlx5: FPGA, return -EINVAL if size is zero

2017-11-09 Thread Saeed Mahameed

On Thu, 2017-11-09 at 18:12 +0900, Or Gerlitz wrote:
> On Thu, Nov 9, 2017 at 4:43 PM, Kamal Heib 
> wrote:
> > On Wed, 2017-11-08 at 23:13 +0900, Or Gerlitz wrote:
> > > On Wed, Nov 8, 2017 at 4:21 PM, Saeed Mahameed
> > > wrote:
> > > > From: Kamal Heib 
> > > > 
> > > > In the current code, if a size of zero is passed to
> > > > mlx5_fpga_mem_{read|write}_i2c() functions the "err"
> > > 
> > > Don't we need to fix the call site where zero size is provided
> > > and
> > > not
> > > in called function?
> > > 
> > 
> > Isn't sending down a zero size a sign for a bug which we are not
> > fixing?
> > > 
> > 
> > Both functions are called from an exported symbols. so I think the
> > size
> > validation should be within this two functions just like the case
> > of
> > checking that mdev isn't set.
> 
> mmm, I see exported to who exactly? how are they being called, by
> func pointer?
> can you point to the call sites?

Or, are you ok with this patch ? I would like to post V2 with the
reviewed-by tag fix.

Re: [PATCH net-next] sfc: don't warn on successful change of MAC

2017-11-09 Thread David Miller

From: Bert Kenward 
Date: Tue, 7 Nov 2017 17:30:30 +

> From: Robert Stonehouse 
> 
> Fixes: 535a61777f44e ("sfc: suppress handled MCDI failures when changing the 
> MAC address")
> Signed-off-by: Bert Kenward 

Applied, thank you.

Re: regression: UFO removal breaks kvm live migration

2017-11-09 Thread David Miller

From: Willem de Bruijn 
Date: Fri, 10 Nov 2017 14:32:29 +0900

> Okay, I will send a patch to reinstate UFO for this use case (only).

Thank you.

Re: [RFD] Managed interrupt affinities [ Was: mlx5 broken affinity ]

2017-11-09 Thread Saeed Mahameed

On Thu, 2017-11-09 at 22:42 +0100, Thomas Gleixner wrote:
> Find below a summary of the technical details, implications and
> options
> 
> What can be done for 4.14?
> 
>   We basically have two options: Revert at the driver level or ship
> as
>   is.
> 

I think we all came to the consensus that this is the only immediate
action to solve the mlx5 regression, so I am going to revert the driver
level change.

>   Even if we come up with a quick and dirty hack then it will be too
> late
>   for proper testing before sunday.
> 
> 
> What can be done with some time to work on?
> 
> The managed mechanism consists of 3 pieces:
> 
>  1) Vector spreading
> 
>  2) Managed vector allocation, which becomes a guaranteed reservation
> in
> 4.15 due to the big rework of the vector management code.
> 
> Non managed interrupts get a best effort reservation to handle
> the CPU
> unplug vector pressure problem in a sane way.
> 
>  3) CPU hotplug management
> 
> If the last CPU in the affinity set goes offline, then the
> interrupt is
> shutdown and restarted when the first CPU in the affinity set
> comes
> online again. The driver code needs to ensure that the queue
> associated
> to that interrupt is drained before shutdown and nothing is
> queued
> there after this point.
> 

Well, I can speak for the mlx5 case, or most network drivers, where
all of the queues associated with an interrupt move with it, so I
don't think our current driver has this issue. I don't believe there
are network drivers with fixed per-CPU resources, but it is worth double
checking.

Regarding the solutions below, any one that guarantees the initial
managed spreading and still allows the user to modify affinity via
/proc/irq/xyz/smp_affinity will be acceptable, since many tools and users
rely on this procfs entry (e.g. irqbalance).

Thank you Thomas for handling and all the detailed information.
-Saeed.

> So we have options:
> 
> 1) Initial vector spreading 
> 
>  Let the driver use the initial vector spreading. That does only the
>  initial affinity setup, but otherwise the interrupts are handled
> like any
>  other non managed interrupt, i.e. best effort reservation, affinity
>  settings enabled and CPU unplug breaks affinity and moves them to
> some
>  random other online CPU.
> 
>  The simplest solution of all.
> 
> 2) Allowing a driver supplied mask
> 
>  Certainly simple to do, but as you said it's not really a solution.
> I'm
>  not sure whether we want to go there as this is going to be replaced
> fast
>  enough and then create another breakage/frustration level.
> 
> 
> 3) Affinity override in managed mode
> 
>  Doable, but there are a couple of things to think about:
> 
>   * How is this enabled?
> 
> - Opt-in by driver
>
> - Extra sysfs/procfs knob
> 
> We definitely should not enable it per default because that would
> surprise users/drivers which work with the current managed
> devices and
> rely on the affinity files to be non writeable in managed mode.
> 
>   * Is it allowed to set the affinity to offline, but present CPUs?
> 
>  In principle yes, because the core management code can do that
> as well
>  at setup time.
> 
>   * The affinity setting must fail when it cannot do a guaranteed
> reservation on the new target CPU(s).
> 
>  This is not much of a question. That's a matter of fact because
>  otherwise the association cannot be guaranteed and things fall
> apart
>  all over the place.
> 
>   * When and how is the driver informed about the change?
> 
>  When:
> 
>#1 Before the core tries to move the interrupt so it can veto
> the
> move if it cannot allocate new resources or whatever is
> required
> to operate after the move.
> 
>#2 After the core made the move effective because:
> 
>   - The interrupt might be moved from an offline set to an
> online
> set and needs to be started up, so the related queue must
> be
> enabled as well.
> 
>   - The interrupt might be moved from an online set to an
> offline
> set, so the queue needs to be drained and disabled.
> 
> - Resources which have been allocated in the first step must
> be
> made effective and old resources freed.
> 
>  How:
> 
>The existing affinity notification mechanism does not work for
> this
>and it's a horrible piece of crap which should go away sooner
> than
>later.
> 
>So we need some sensible way to provide callback. Emphasis on
>callbacks as one multiplexing callback is not a good idea.
> 
>   * How can the change made effective?
> 
> When the preliminaries (vector reservation on the new set and
> evtl. resource allocation in the subsystem have been done, then
> the
> actual move can be made.
> 
> But, there is a caveat. x86 is not good in reassociating
> interrupts on
> the fly except when it sits

Re: [PATCH net] rds: ib: Fix NULL pointer dereference in debug code

2017-11-09 Thread David Miller

From: Håkon Bugge 
Date: Tue,  7 Nov 2017 16:33:34 +0100

> rds_ib_recv_refill() is a function that refills an IB receive
> queue. It can be called from both the CQE handler (tasklet) and a
> worker thread.
> 
> Just after the call to ib_post_recv(), a debug message is printed with
> rdsdebug():
> 
> ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
> rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
>  recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
>  (long) ib_sg_dma_address(
> ic->i_cm_id->device,
> &recv->r_frag->f_sg),
> ret);
> 
> Now consider an invocation of rds_ib_recv_refill() from the worker
> thread, which is preemptible. Further, assume that the worker thread
> is preempted between the ib_post_recv() and rdsdebug() statements.
> 
> Then, if the preemption is due to a receive CQE event, the
> rds_ib_recv_cqe_handler() will be invoked. This function processes
> receive completions, including freeing up data structures, such as the
> recv->r_frag.
> 
> In this scenario, rds_ib_recv_cqe_handler() will process the receive
> WR posted above. That implies, that the recv->r_frag has been freed
> before the above rdsdebug() statement has been executed. When it is
> later executed, we will have a NULL pointer dereference:
 ...
> This bug was provoked by compiling rds out-of-tree with
> EXTRA_CFLAGS="-DRDS_DEBUG -DDEBUG" and inserting an artificial delay
> between the rdsdebug() and ib_post_recv() statements:
> 
>  /* XXX when can this fail? */
>  ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
> + if (can_wait)
> + usleep_range(1000, 5000);
>  rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
>   recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
>   (long) ib_sg_dma_address(
> 
> The fix is simply to move the rdsdebug() statement up before the
> ib_post_recv() and remove the printing of ret, which is taken care of
> anyway by the non-debug code.
> 
> Signed-off-by: Håkon Bugge 
> Reviewed-by: Knut Omang 
> Reviewed-by: Wei Lin Guay 

Applied, thank you.

Re: [PATCH net-next v7 0/3] nfp act_vlan: Rewrite of the TC vlan action to use the RCU, and incorporated review comments

2017-11-09 Thread David Miller

From: Manish Kurup 
Date: Tue,  7 Nov 2017 08:50:00 -0500

> This commit consists of 3 patches:
> 
> patch1 (1/3):
> The VLAN action maintains one set of stats across all cores, and uses a
> spinlock to synchronize updates to it from the same. Changed this to use a
> per-CPU stats context instead.
> This change will result in better performance.
> 
> patch2 (2/3):
> Modified netronome nfp flower action to use VLAN helper functions instead
> of accessing/referencing TC act_vlan private structures directly. 
> 
> patch3 (3/3):
> Using a spinlock in the VLAN action causes performance issues when the VLAN
> action is used on multiple cores. Rewrote the VLAN action to use RCU read
> locking for reads and updates instead.
> All functions now use an RCU dereferenced pointer to access the VLAN action
> context. Modified helper functions used by other modules, to use the RCU as
> opposed to directly accessing the structure.
> 
> As part of this review, there were some changes suggested by reviewers.
> I have incorporated all the changes that were requested.
 ...

Series applied, thank you.

Re: [PATCH net-next 0/2] ip_gre: add support for i/o_flags update

2017-11-09 Thread David Miller

From: Xin Long 
Date: Tue,  7 Nov 2017 16:33:07 +0800

> ip_gre is using as many ip_tunnel apis as possible, newlink works
> fine as gre would do its own part in .ndo_init. But when changing
> link, ip_tunnel_changelink doesn't even update i/o_flags, and also
> the update of these flags would require some other gre properties
> to be updated or recalculated.
> 
> These two patches add the i/o_flags update and then adjust
> some gre properties according to the new i/o_flags.

Series applied, thank you.

Re: [PATCH net-next 0/2] net: Namespace-ify sysctl_tcp_rmem and sysctl_tcp_wmem

2017-11-09 Thread David Miller

From: Eric Dumazet 
Date: Tue,  7 Nov 2017 00:29:26 -0800

> We need to get per netns sysctl for sysctl_[proto]_rmem and
> sysctl_[proto]_wmem
> 
> This patch series adds the basic infrastructure allowing per proto
> conversion, and takes care of TCP.

Series applied, thanks Eric.

[PATCHv2 0/2] capability controlled user-namespaces

2017-11-09 Thread Mahesh Bandewar

From: Mahesh Bandewar 

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage in, e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc., just to name a few.
The current form of user-namespaces, however, if changed a bit, can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in their current form have increased the attack surface, as
any process can acquire capabilities which are not available to it (by
default) by performing a combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take advantage of the above-mentioned
or similar issues (discovered or undiscovered) with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*. 

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

A global whitelist is a list of capabilities governed by the
sysctl which is available to (privileged) user in init-ns to modify
while it's applicable to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent to the current behavior. The default value of whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  4 
 include/linux/user_namespace.h  | 20 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  4 
 security/commoncap.c|  8 +++
 7 files changed, 114 insertions(+)

-- 
2.15.0.448.gf294e3d99a-goog

[PATCHv2 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-11-09 Thread Mahesh Bandewar

From: Mahesh Bandewar 

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Signed-off-by: Mahesh Bandewar 
---
v2:
  Rebase
v1:
  Initial submission

 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..a1d39dbae847 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index f640dcbc880c..7d79a4689625 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -14,6 +14,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..4a859b7d4902 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP);
+   if (ret)
+   return ret;
+
+   ret = bitmap_to_u32array(tmp.cap, _KERNEL_CAPABILITY_U32S,
+

[PATCHv2 2/2] userns: control capabilities of some user namespaces

2017-11-09 Thread Mahesh Bandewar

From: Mahesh Bandewar 

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Signed-off-by: Mahesh Bandewar 
---
v2:
  Don't recalculate user-ns flags for every setns() call.
v1:
  Initial submission.

 include/linux/capability.h |  1 +
 include/linux/user_namespace.h | 20 
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  4 
 security/commoncap.c   |  8 
 5 files changed, 38 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7d79a4689625..a1fd9e460379 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 3fe714da7f5a..647f825c7b5f 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -23,6 +23,7 @@ struct uid_gid_map {  /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -103,6 +104,16 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -161,6 +172,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 4a859b7d4902..bffe249922de 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -511,6 +511,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..600c7dcb9ff7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -139,6 +139,10 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
 
set_cred_user_ns(new, ns);
+   if (!ns_capable(parent_ns, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(parent_ns))
+   mark_user_ns_controlled(ns);
+
return 0;
 fail_keyring:
 #ifdef CONFIG_PERSISTENT_KEYRINGS
diff --git a/security/commoncap.c b/security/commoncap.c
index fc46f5b85251..89103f16ac37 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct 
user_namespace *targ_ns,
 {
struct user_namespace *ns = targ_ns;
 
+   /* If the capability is controlled and user-ns that process
+* belongs-to is 'controlled' then return EPERM and no need
+* to check the user-ns hierarchy.
+*/
+   if

Re: regression: UFO removal breaks kvm live migration

2017-11-09 Thread Willem de Bruijn

On Wed, Nov 8, 2017 at 9:53 PM, Jason Wang  wrote:
>
>
> On 2017年11月08日 20:32, David Miller wrote:
>>
>> From: Jason Wang 
>> Date: Wed, 8 Nov 2017 17:25:48 +0900
>>
>>> On 2017年11月08日 17:08, Willem de Bruijn wrote:

 That won't help in the short term. I'm still reading up to see if
 there are
 any other options besides reimplement or advertise-but-drop, such as
 an implicit trigger that would make the guest renegotiate. It's
 unlikely, but
 worth a look..
>>>
>>> Yes, this looks hard. And even if we can manage to do this, it looks
>>> an overkill since it will impact all guest after migration.
>>
>> Like Willem I would much prefer "advertise-but-drop" if it works.
>
>
> This makes migration work but all guest UFO traffic will stall.
>
>>
>> In the long term feature renegotiation triggers are a must.
>>
>> There is no way for us to remove features otherwise.
>
>
> We can remove if we don't break userspace(guest).
>
>> In my opinion
>> this will even make migrations more powerful.
>
>
> But this does not help for guest running old version of kernel which still
> think UFO work.

Indeed, if we have to support live migration of arbitrary old guests
without any expectations on hypervisor version either, features can
simply never be reverted, even if a negotiation interface exists.

At least for upcoming features and devices, guest code should not
have this expectation, but from the start allow renegotiation such as
CTRL_GUEST_OFFLOADS [1] based on a host trigger. But for
tuntap TUNSETOFFLOAD it seems that ship has sailed.

Okay, I will send a patch to reinstate UFO for this use case (only). There
is some related work in tap_handle_frame and packet_direct_xmit to
segment directly in the device. I will be traveling the next few days, so
it won't be in time for 4.14 (but can go in stable later, of course).

[1] https://patchwork.kernel.org/patch/9850785/

Re: [PATCH] net: dsa: Don't add vlans when vlan filtering is disabled

2017-11-09 Thread David Miller

From: Andrew Lunn 
Date: Tue,  7 Nov 2017 00:04:24 +0100

> The software bridge can be built with vlan filtering support
> included. However, by default it is turned off. In its turned off
> state, it still passes VLANs via switchdev, even though they are not to
> be used. Don't pass these VLANs to the hardware. Only do so when vlan
> filtering is enabled.
> 
> This fixes at least one corner case. There are still issues in other
> corners, such as when vlan_filtering is later enabled.
> 
> Signed-off-by: Andrew Lunn 

Applied to net-next, thanks Andrew.

Re: [kernel-hardening] Re: [PATCH resend 2/2] userns: control capabilities of some user namespaces

2017-11-09 Thread महेश बंडेवार

On Fri, Nov 10, 2017 at 1:46 PM, Serge E. Hallyn  wrote:
> Quoting Eric W. Biederman (ebied...@xmission.com):
>> single sandbox.  I am not at all certain that the capabilities is the
>> proper place to limit code reachability.
>
> Right, I keep having this gut feeling that there is another way we
> should be doing that.  Maybe based on ksplice or perf, or maybe more
> based on subsystems.  And I hope someone pursues that.  But I can't put
> my finger on it, and meanwhile the capability checks obviously *are* in
> fact gates...
>
Well, I don't mind if there is a better solution available. The
proposed solution is not adding too much or complex code and using a
bit and a sysctl and will be sitting dormant. When we have complete
solution, this addition should not be a burden to maintain because of
its non-invasive footprint.

I will push the next version of the patch-set that implements Serge's finding.

Thanks,
--mahesh..

[PS: I'll be soon traveling again and moving to an area where
connectivity will be scarce / unreliable. So please expect a lot more
delays in my responses.]

> -serge

Re: [PATCH resend 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-11-09 Thread महेश बंडेवार

On Fri, Nov 10, 2017 at 1:30 PM, Serge E. Hallyn  wrote:
> Quoting Mahesh Bandewar (महेश बंडेवार) (mahe...@google.com):
> ...
>> >>
>> >>  ==
>> >>
>> >> +controlled_userns_caps_whitelist
>> >> +
>> >> +Capability mask that is whitelisted for "controlled" user namespaces.
>> >> +Any capability that is missing from this mask will not be allowed to
>> >> +any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
>> >> +is not part of this mask, then processes running inside any controlled
>> >> +userns's will not be allowed to perform action that needs CAP_NET_RAW
>> >> +capability. However, processes that are attached to a parent user-ns
>> >> +hierarchy that is *not* controlled and has CAP_NET_RAW can continue
>> >> +performing those actions. User-namespaces are marked "controlled" at
>> >> +the time of their creation based on the capabilities of the creator.
>> >> +A process that does not have CAP_SYS_ADMIN will create user-namespaces
>> >> +that are controlled.
>> >
>> > Hm.  I think that's fine (the way 'controlled' user namespaces are
>> > defined), but that is design decision in itself, and should perhaps be
>> > discussed.
>> >
>> > Did you consider other ways?  What about using CAP_SETPCAP?
>> >
>> I did try other ways e.g. using another bounding-set etc. but
>> eventually settled with this approach because of two main properties -
>
> No, I meant did you try other ways of defining a controlled user
> namespace, other than one which is created by a task lacking
> CAP_SYS_ADMIN?
>
SYS_ADMIN is the capability that has been used for deciding who can or
cannot create namespaces, so didn't want to create another model that
may not be compatible with current model which is well understood
hence no.

> ...
>
>> >> +The value is expressed as two comma separated hex words (u32). This
>> >
>> > Why comma separated?  whitespace ok?  Leading 0x ok?  What is the
>> > default at boot?  (Obviously the patch tells me, I'm asking for it
>> > to be spelled out in the doc)
>> >
>> I tried multiple ways including representing capabilities in
>> string/name form for better readability but didn't want to add
>> additional complexities of dealing with strings and possible
>> string-related-issues for this. Also didn't want to reinvent the new
>> form so settled with something that is widely used (cpu
>> bounding/affinity/irq mapping etc.) and is capable of handling growing
>> bit set (currently 37 but possibly more later).
>
> Ok, thanks.

Re: [PATCH] tcp: Export to userspace the TCP state names for the trace events

2017-11-09 Thread Yafang Shao

2017-11-10 8:57 GMT+08:00 Steven Rostedt :
>
> From: "Steven Rostedt (VMware)" 
>
> The TCP trace events (specifically tcp_set_state) map enums to symbol
> names via __print_symbolic(). But this only works for reading trace events
> from the tracefs trace files. If perf or trace-cmd were to record these
> events, the event format file does not convert the enum names into numbers,
> and you get something like:
>
> __print_symbolic(REC->oldstate,
> { TCP_ESTABLISHED, "TCP_ESTABLISHED" },
> { TCP_SYN_SENT, "TCP_SYN_SENT" },
> { TCP_SYN_RECV, "TCP_SYN_RECV" },
> { TCP_FIN_WAIT1, "TCP_FIN_WAIT1" },
> { TCP_FIN_WAIT2, "TCP_FIN_WAIT2" },
> { TCP_TIME_WAIT, "TCP_TIME_WAIT" },
> { TCP_CLOSE, "TCP_CLOSE" },
> { TCP_CLOSE_WAIT, "TCP_CLOSE_WAIT" },
> { TCP_LAST_ACK, "TCP_LAST_ACK" },
> { TCP_LISTEN, "TCP_LISTEN" },
> { TCP_CLOSING, "TCP_CLOSING" },
> { TCP_NEW_SYN_RECV, "TCP_NEW_SYN_RECV" })
>
> Where trace-cmd and perf do not know the values of those enums.
>
> Use the TRACE_DEFINE_ENUM() macros that will have the trace events convert
> the enum strings into their values at system boot. This will allow perf and
> trace-cmd to see actual numbers and not enums:
>
> __print_symbolic(REC->oldstate,
> { 1, "TCP_ESTABLISHED" },
> { 2, "TCP_SYN_SENT" },
> { 3, "TCP_SYN_RECV" },
> { 4, "TCP_FIN_WAIT1" },
> { 5, "TCP_FIN_WAIT2" },
> { 6, "TCP_TIME_WAIT" },
> { 7, "TCP_CLOSE" },
> { 8, "TCP_CLOSE_WAIT" },
> { 9, "TCP_LAST_ACK" },
> { 10, "TCP_LISTEN" },
> { 11, "TCP_CLOSING" },
> { 12, "TCP_NEW_SYN_RECV" })
>
> Signed-off-by: Steven Rostedt (VMware) 
> ---
>  include/trace/events/tcp.h | 41 -
>  1 file changed, 28 insertions(+), 13 deletions(-)
>
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index 07a6cbf1..62e5bad7901f 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -9,21 +9,36 @@
>  #include 
>  #include 
>
> +#define tcp_state_names\
> +   EM(TCP_ESTABLISHED) \
> +   EM(TCP_SYN_SENT)\
> +   EM(TCP_SYN_RECV)\
> +   EM(TCP_FIN_WAIT1)   \
> +   EM(TCP_FIN_WAIT2)   \
> +   EM(TCP_TIME_WAIT)   \
> +   EM(TCP_CLOSE)   \
> +   EM(TCP_CLOSE_WAIT)  \
> +   EM(TCP_LAST_ACK)\
> +   EM(TCP_LISTEN)  \
> +   EM(TCP_CLOSING) \
> +   EMe(TCP_NEW_SYN_RECV)
> +
> +/* enums need to be exported to user space */
> +#undef EM
> +#undef EMe
> +#define EM(a) TRACE_DEFINE_ENUM(a);
> +#define EMe(a)TRACE_DEFINE_ENUM(a);
> +
> +tcp_state_names
> +
> +#undef EM
> +#undef EMe
> +#define EM(a) tcp_state_name(a),
> +#define EMe(a)tcp_state_name(a)
> +
>  #define tcp_state_name(state)  { state, #state }
>  #define show_tcp_state_name(val)   \
> -   __print_symbolic(val,   \
> -   tcp_state_name(TCP_ESTABLISHED),\
> -   tcp_state_name(TCP_SYN_SENT),   \
> -   tcp_state_name(TCP_SYN_RECV),   \
> -   tcp_state_name(TCP_FIN_WAIT1),  \
> -   tcp_state_name(TCP_FIN_WAIT2),  \
> -   tcp_state_name(TCP_TIME_WAIT),  \
> -   tcp_state_name(TCP_CLOSE),  \
> -   tcp_state_name(TCP_CLOSE_WAIT), \
> -   tcp_state_name(TCP_LAST_ACK),   \
> -   tcp_state_name(TCP_LISTEN), \
> -   tcp_state_name(TCP_CLOSING),\
> -   tcp_state_name(TCP_NEW_SYN_RECV))
> +   __print_symbolic(val, tcp_state_names)
>
>  /*
>   * tcp event with arguments sk and skb
> --
> 2.13.6
>

Could the macro tcp_state_name() be renamed?
If <trace/events/tcp.h> is included in include/net/tcp.h, it will
cause a compile error, because there's another function tcp_state_name()
defined in net/netfilter/ipvs/ip_vs_proto_tcp.c.
static const char * tcp_state_name(int state)
{

if (state >= IP_VS_TCP_S_LAST)

return "ERR!";

return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";

}


Thanks
Yafang

Re: [pull request][net-next 0/8] Mellanox, mlx5 updates 2017-11-09

2017-11-09 Thread David Miller

From: Saeed Mahameed 
Date: Thu,  9 Nov 2017 14:15:45 +0900

> This series introduces vlan offloads related improvements for mlx5
> ethernet netdev driver Plus some trivial refactoring patches needed by
> this series, from Gal Pressman.
> 
> For more information please see tag log below.
> 
> Please pull and let me know if there's any problem.

Pulled, thanks Saeed!

Re: [kernel-hardening] Re: [PATCH resend 2/2] userns: control capabilities of some user namespaces

2017-11-09 Thread Serge E. Hallyn

Quoting Eric W. Biederman (ebied...@xmission.com):
> single sandbox.  I am not at all certain that the capabilities is the
> proper place to limit code reachability.

Right, I keep having this gut feeling that there is another way we
should be doing that.  Maybe based on ksplice or perf, or maybe more
based on subsystems.  And I hope someone pursues that.  But I can't put
my finger on it, and meanwhile the capability checks obviously *are* in
fact gates...

-serge

Re: [PATCH v4 net-next 0/6] IGMP snooping for local traffic

2017-11-09 Thread David Miller

From: Andrew Lunn 
Date: Thu,  9 Nov 2017 23:10:56 +0100

> The linux bridge supports IGMP snooping. It will listen to IGMP
> reports on bridge ports and keep track of which groups have been
> joined on an interface. It will then forward multicast based on this
> group membership.
> 
> When the bridge adds or removes groups from an interface, it uses
> switchdev to request the hardware add an mdb to a port, so the
> hardware can perform the selective forwarding between ports.
> 
> What is not covered by the current bridge code, is IGMP joins/leaves
> from the host on the brX interface. These are not reported via
> switchdev so that hardware knows the local host is interested in the
> multicast frames.
> 
> Luckily, the bridge does track joins/leaves on the brX interface. The
> code is obfuscated, which is why I missed it with my first attempt.
> So the first patch tries to remove this obfuscation. Currently,
> there is no notifications sent when the bridge interface joins a
> group. The second patch adds them. bridge monitor then shows
> joins/leaves in the same way as for other ports of the bridge.
> 
> Then starts the work passing down to the hardware that the host has
> joined/left a group. The existing switchdev mdb object cannot be used,
> since the semantics are different. The existing
> SWITCHDEV_OBJ_ID_PORT_MDB is used to indicate a specific multicast
> group should be forwarded out that port of the switch. However here we
> require the exact opposite. We want multicast frames for the group
> received on the port to be forwarded to the host. Hence add a new
> object SWITCHDEV_OBJ_ID_HOST_MDB, a multicast database entry to
> forward to the host. This new object is then propagated through the
> DSA layers. No DSA driver changes should be needed, this should just
> work...
> 
> This version fixes up the nitpick from Nikolay, removes an unrelated
> white space change, and adds in a patch adding a few const attributes
> to a couple of functions taking a port parameter, in order to stop the
> following patch producing warnings.
> 
> Acked-by: Stephen Hemminger 

Series applied, thanks Andrew.

Re: [PATCH net-next 3/4] net: phy: sfp: Separate enumerations and states

2017-11-09 Thread Florian Fainelli



On 11/08/2017 12:58 AM, Russell King - ARM Linux wrote:
> On Tue, Nov 07, 2017 at 07:49:10PM -0800, Florian Fainelli wrote:
>> Create separate enumerations for the SFP physical state (computed from 
>> GPIOs),
>> device state, module state, and actual state machine. This will make it 
>> easier
>> to make sure the correct states are used, and also pretty print those to help
>> debugging.
> 
> The compiler does no type checking of these, so I don't see how it
> makes it any "easier to make sure the correct states are used".

The types currently used (unsigned char, unsigned short) do not make it
easy to spot what the enumeration is about and what values could be
valid. Overall it seems to me like this improves code readability if
nothing else.
-- 
Florian

Re: [PATCH net-next 1/4] net: phy: sfp: Do not reject soldered down modules

2017-11-09 Thread Florian Fainelli



On 11/08/2017 03:15 AM, Russell King - ARM Linux wrote:
> On Tue, Nov 07, 2017 at 07:49:08PM -0800, Florian Fainelli wrote:
>> The SFP module identification code in sfp_sm_mod_probe() will reject SFF
>> modules soldered down because they have an identifier of 0x2, while the code
>> currently checks for 0x3 only (SFP_PHYS_ID_SFP), update that.
>>
>> Signed-off-by: Florian Fainelli 
>> ---
>>  drivers/net/phy/sfp.c | 5 +++--
>>  include/linux/sfp.h   | 1 +
>>  2 files changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
>> index e381811e5f11..942288aa9cdb 100644
>> --- a/drivers/net/phy/sfp.c
>> +++ b/drivers/net/phy/sfp.c
>> @@ -463,8 +463,9 @@ static int sfp_sm_mod_probe(struct sfp *sfp)
>>   vendor, part, rev, sn, date);
>>  
>>  /* We only support SFP modules, not the legacy GBIC modules. */
>> -if (sfp->id.base.phys_id != SFP_PHYS_ID_SFP ||
>> -sfp->id.base.phys_ext_id != SFP_PHYS_EXT_ID_SFP) {
>> +if ((sfp->id.base.phys_id != SFP_PHYS_ID_SFP &&
>> + sfp->id.base.phys_id != SFP_PHYS_ID_SFF) ||
>> + sfp->id.base.phys_ext_id != SFP_PHYS_EXT_ID_SFP) {
> 
> I'd prefer that we do something like the patch I sent a couple of nights
> ago, having a separate compatible for the SFF modules (since they have
> no insert signal as SFF is soldered in place) and use that to decide
> which phys_id we accept here.

Fair enough.
-- 
Florian

Re: [PATCH net-next 2/4] net: phy: sfp: Use correct endian for sfp->id.ext.options

2017-11-09 Thread Florian Fainelli



On 11/08/2017 12:56 AM, Russell King - ARM Linux wrote:
> On Tue, Nov 07, 2017 at 07:49:09PM -0800, Florian Fainelli wrote:
>> The extended ID options 16-bit value is big-endian (and actually annotated as
>> such), but we would be accessing it with our CPU endian, which would not
>> allow the correct detection of whether the LOS signal is inverted or not.
>>
>> Fixes: 73970055450e ("sfp: add SFP module support")
>> Signed-off-by: Florian Fainelli 
>> ---
>>  drivers/net/phy/sfp.c | 8 +---
>>  1 file changed, 5 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
>> index 942288aa9cdb..dfb28b269687 100644
>> --- a/drivers/net/phy/sfp.c
>> +++ b/drivers/net/phy/sfp.c
>> @@ -355,7 +355,7 @@ static void sfp_sm_link_check_los(struct sfp *sfp)
>>   * SFP_OPTIONS_LOS_NORMAL are set?  For now, we assume
>>   * the same as SFP_OPTIONS_LOS_NORMAL set.
>>   */
>> -if (sfp->id.ext.options & SFP_OPTIONS_LOS_INVERTED)
>> +if (be16_to_cpu(sfp->id.ext.options) & SFP_OPTIONS_LOS_INVERTED)
> 
> It would be more efficient to convert the constants to BE16 rather
> than an indeterminate number to CPU endian.  The compiler can optimise
> the constant.  Same for the other two hunks.

Sure, I can do that.
-- 
Florian

Re: linux-next: manual merge of the net-next tree with Linus' tree

2017-11-09 Thread David Miller

From: Stephen Rothwell 
Date: Fri, 10 Nov 2017 10:31:00 +1100

> Hi all,
> 
> Today's linux-next merge of the net-next tree got a conflict in:
> 
>   net/sched/cls_basic.c
>   net/sched/cls_u32.c
> 
> between commits:
> 
>   0b2a59894b76 ("cls_basic: use tcf_exts_get_net() before call_rcu()")
>   35c55fc156d8 ("cls_u32: use tcf_exts_get_net() before call_rcu()")
> 
> from Linus' tree and commit:
> 
>   1d8134fea2eb ("net_sched: use idr to allocate basic filter handles")
> 
> from the net-next tree.

This should be resolved as I've just merged 'net' into 'net-next'.

Re: linux-next: manual merge of the net-next tree with Linus' tree

2017-11-09 Thread Cong Wang

On Thu, Nov 9, 2017 at 3:31 PM, Stephen Rothwell  wrote:
> I fixed it up (I think - see below) and can carry the fix as necessary.
> This is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.

It looks good to me.

Thanks!

Re: [kernel-hardening] Re: [PATCH resend 2/2] userns: control capabilities of some user namespaces

2017-11-09 Thread महेश बंडेवार

On Fri, Nov 10, 2017 at 6:58 AM, Eric W. Biederman
 wrote:
> "Mahesh Bandewar (महेश बंडेवार)"  writes:
>
>> [resend response as earlier one failed because of formatting issues]
>>
>> On Thu, Nov 9, 2017 at 12:21 PM, Serge E. Hallyn  wrote:
>>>
>>> On Thu, Nov 09, 2017 at 09:55:41AM +0900, Mahesh Bandewar (महेश बंडेवार) 
>>> wrote:
>>> > On Thu, Nov 9, 2017 at 4:02 AM, Christian Brauner
>>> >  wrote:
>>> > > On Wed, Nov 08, 2017 at 03:09:59AM -0800, Mahesh Bandewar (महेश 
>>> > > बंडेवार) wrote:
>>> > >> Sorry folks I was traveling and seems like lot happened on this 
>>> > >> thread. :p
>>> > >>
>>> > >> I will try to respond to a few of these comments selectively -
>>> > >>
>>> > >> > The thing that makes me hesitate with this set is that it is a
>>> > >> > permanent new feature to address what (I hope) is a temporary
>>> > >> > problem.
>>> > >> I agree this is permanent new feature but it's not solving a temporary
>>> > >> problem. It's impossible to assess what and when new vulnerability
>>> > >> that could show up. I think Daniel summed it up appropriately in his
>>> > >> response
>>> > >>
>>> > >> > Seems like there are two naive ways to do it, the first being to just
>>> > >> > look at all code under ns_capable() plus code called from there.  It
>>> > >> > seems like looking at the result of that could be fruitful.
>>> > >> This is really hard. The main issue that there were features designed
>>> > >> and developed before user-ns days with an assumption that unprivileged
>>> > >> users will never get certain capabilities which only root user gets.
>>> > >> Now that is not true anymore with user-ns creation with mapping root
>>> > >> for any process. Also at the same time blocking user-ns creation for
>>> > >> everyone is a big-hammer which is not needed too. So it's not that easy
>>> > >> to just perform a code-walk-through and correct those decisions now.
>>> > >>
>>> > >> > It seems to me that the existing control in
>>> > >> > /proc/sys/kernel/unprivileged_userns_clone might be the better duct 
>>> > >> > tape
>>> > >> > in that case.
>>> > >> This solution is essentially blocking unprivileged users from using
>>> > >> the user-namespaces entirely. This is not really a solution that can
>>> > >> work. The solution that this patch-set adds allows unprivileged users
>>> > >> to create user-namespaces. Actually the proposed solution is more
>>> > >> fine-grained approach than the unprivileged_userns_clone solution
>>> > >> since you can selectively block capabilities rather than completely
>>> > >> blocking the functionality.
>>> > >
>>> > > I've been talking to Stéphane today about this and we should also keep 
>>> > > in mind
>>> > > that we have:
>>> > >
>>> > > chb@conventiont|~
>>> > >> ls -al /proc/sys/user/
>>> > > total 0
>>> > > dr-xr-xr-x 1 root root 0 Nov  6 23:32 .
>>> > > dr-xr-xr-x 1 root root 0 Nov  2 22:13 ..
>>> > > -rw-r--r-- 1 root root 0 Nov  8 19:48 max_cgroup_namespaces
>>> > > -rw-r--r-- 1 root root 0 Nov  8 19:48 max_inotify_instances
>>> > > -rw-r--r-- 1 root root 0 Nov  8 19:48 max_inotify_watches
>>> > > -rw-r--r-- 1 root root 0 Nov  8 19:48 max_ipc_namespaces
>>> > > -rw-r--r-- 1 root root 0 Nov  8 19:48 max_mnt_namespaces
>>> > > -rw-r--r-- 1 root root 0 Nov  8 19:48 max_net_namespaces
>>> > > -rw-r--r-- 1 root root 0 Nov  8 19:48 max_pid_namespaces
>>> > > -rw-r--r-- 1 root root 0 Nov  8 19:48 max_user_namespaces
>>> > > -rw-r--r-- 1 root root 0 Nov  8 19:48 max_uts_namespaces
>>> > >
>>> > > These files allow you to limit the number of namespaces that can be 
>>> > > created
>>> > > *per namespace* type. So let's say your system runs a bunch of user 
>>> > > namespaces
>>> > > you can do:
>>> > >
>>> > > chb@conventiont|~
>>> > >> echo 0 > /proc/sys/user/max_user_namespaces
>>> > >
>>> > > So that the next time you try to create a user namespaces you'd see:
>>> > >
>>> > > chb@conventiont|~
>>> > >> unshare -U
>>> > > unshare: unshare failed: No space left on device
>>> > >
>>> > > So there's not even a need to upstream a new sysctl since we have ways 
>>> > > of
>>> > > blocking this.
>>> > >
>>> > I'm not sure how it's solving the problem that my patch-set is addressing?
>>> > I agree though that the need for unprivileged_userns_clone sysctl goes
>>> > away as this is equivalent to setting that sysctl to 0 as you have
>>> > described above.
>>>
>>> oh right that was the reasoning iirc for not needing the other sysctl.
>>>
>>> > However as I mentioned earlier, blocking processes from creating
>>> > user-namespaces is not the solution. Processes should be able to
>>> > create namespaces as they are designed but at the same time we need to
>>> > have controls to 'contain' them if a need arise. Setting max_no to 0
>>> > is not the solution that I'm looking for since it doesn't solve the
>>> > problem.
>>>
>>> well yesterday we were told that was explicitly not the goal, but that was

Re: [PATCH resend 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-11-09 Thread Serge E. Hallyn

Quoting Mahesh Bandewar (महेश बंडेवार) (mahe...@google.com):
...
> >>
> >>  ==
> >>
> >> +controlled_userns_caps_whitelist
> >> +
> >> +Capability mask that is whitelisted for "controlled" user namespaces.
> >> +Any capability that is missing from this mask will not be allowed to
> >> +any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
> >> +is not part of this mask, then processes running inside any controlled
> >> +userns's will not be allowed to perform action that needs CAP_NET_RAW
> >> +capability. However, processes that are attached to a parent user-ns
> >> +hierarchy that is *not* controlled and has CAP_NET_RAW can continue
> >> +performing those actions. User-namespaces are marked "controlled" at
> >> +the time of their creation based on the capabilities of the creator.
> >> +A process that does not have CAP_SYS_ADMIN will create user-namespaces
> >> +that are controlled.
> >
> > Hm.  I think that's fine (the way 'controlled' user namespaces are
> > defined), but that is design decision in itself, and should perhaps be
> > discussed.
> >
> > Did you consider other ways?  What about using CAP_SETPCAP?
> >
> I did try other ways e.g. using another bounding-set etc. but
> eventually settled with this approach because of main two properties -

No, I meant did you try other ways of defining a controlled user
namespace, other than one which is created by a task lacking
CAP_SYS_ADMIN?

...

> >> +The value is expressed as two comma separated hex words (u32). This
> >
> > Why comma separated?  whitespace ok?  Leading 0x ok?  What is the
> > default at boot?  (Obviously the patch tells me, I'm asking for it
> > to be spelled out in the doc)
> >
> I tried multiple ways including representing capabilities in
> string/name form for better readability but didn't want to add
> additional complexities of dealing with strings and possible
> string-related-issues for this. Also didn't want to reinvent the new
> form so settled with something that is widely used (cpu
> bounding/affinity/irq mapping etc.) and is capable of handling growing
> bit set (currently 37 but possibly more later).

Ok, thanks.

[RFC] hv_netvsc: safer orderly shutdown

2017-11-09 Thread Stephen Hemminger


Several types of control operations require that the underlying RNDIS
infrastructure be restarted. This patch changes the ordering of the
shutdown to avoid race conditions.
Stop all transmits before doing RNDIS halt. This involves stopping the
network device transmit queues, then waiting for all outstanding
sends before informing host to halt.

Also, check for successful restart of the device after the
change is done.

For review, not tested on Hyper-V yet.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc_drv.c   | 40 ++-
 drivers/net/hyperv/rndis_filter.c | 23 +++---
 2 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index da216ca4f2b2..3afa082e093d 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -855,8 +855,10 @@ static int netvsc_set_channels(struct net_device *net,
 
orig = nvdev->num_chn;
was_opened = rndis_filter_opened(nvdev);
-   if (was_opened)
+   if (was_opened) {
+   netif_tx_disable(net);
rndis_filter_close(nvdev);
+   }
 
memset(&device_info, 0, sizeof(device_info));
device_info.num_chn = count;
@@ -881,8 +883,13 @@ static int netvsc_set_channels(struct net_device *net,
}
}
 
-   if (was_opened)
-   rndis_filter_open(nvdev);
+   if (was_opened) {
+   ret = rndis_filter_open(nvdev);
+   if (ret)
+   netdev_err(net, "reopening device failed: %d\n", ret);
+   else
+   netif_tx_start_all_queues(net);
+   }
 
/* We may have missed link change notifications */
net_device_ctx->last_reconfig = 0;
@@ -971,8 +978,10 @@ static int netvsc_change_mtu(struct net_device *ndev, int 
mtu)
 
netif_device_detach(ndev);
was_opened = rndis_filter_opened(nvdev);
-   if (was_opened)
+   if (was_opened) {
+   netif_tx_disable(net);
rndis_filter_close(nvdev);
+   }
 
memset(&device_info, 0, sizeof(device_info));
device_info.ring_size = ring_size;
@@ -1004,8 +1013,13 @@ static int netvsc_change_mtu(struct net_device *ndev, 
int mtu)
}
}
 
-   if (was_opened)
-   rndis_filter_open(nvdev);
+   if (was_opened) {
+   ret = rndis_filter_open(nvdev);
+   if (ret)
+   netdev_err(net, "reopening device failed: %d\n", ret);
+   else
+   netif_tx_start_all_queues(net);
+   }
 
netif_device_attach(ndev);
 
@@ -1547,8 +1561,10 @@ static int netvsc_set_ringparam(struct net_device *ndev,
 
netif_device_detach(ndev);
was_opened = rndis_filter_opened(nvdev);
-   if (was_opened)
+   if (was_opened) {
+   netif_tx_disable(net);
rndis_filter_close(nvdev);
+   }
 
rndis_filter_device_remove(hdev, nvdev);
 
@@ -1566,8 +1582,14 @@ static int netvsc_set_ringparam(struct net_device *ndev,
}
}
 
-   if (was_opened)
-   rndis_filter_open(nvdev);
+   if (was_opened) {
+   ret = rndis_filter_open(nvdev);
+   if (ret)
+   netdev_err(net, "reopening device failed: %d\n", ret);
+   else
+   netif_tx_start_all_queues(net);
+   }
+
netif_device_attach(ndev);
 
/* We may have missed link change notifications */
diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 0648eebda829..164f5ffe9c50 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -948,11 +948,20 @@ static void rndis_filter_halt_device(struct rndis_device 
*dev)
struct net_device_context *net_device_ctx = netdev_priv(dev->ndev);
struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
 
+   /* tell bottom half that device is being closed */
+   nvdev->destroy = true;
+
+   /* Force flag to be ordered before waiting */
+   wmb();
+
+   /* Wait for all send completions */
+   wait_event(nvdev->wait_drain, netvsc_device_idle(nvdev));
+
/* Attempt to do a rndis device halt */
request = get_rndis_request(dev, RNDIS_MSG_HALT,
RNDIS_MESSAGE_SIZE(struct rndis_halt_request));
if (!request)
-   goto cleanup;
+   return;
 
/* Setup the rndis set */
halt = &request->request_msg.msg.halt_req;
@@ -963,17 +972,7 @@ static void rndis_filter_halt_device(struct rndis_device 
*dev)
 
dev->state = RNDIS_DEV_UNINITIALIZED;
 
-cleanup:
-   nvdev->destroy = true;
-
-   /* Force flag to be ordered before waiting */
-   wmb();
-
-   /* Wait for all send completions */
-

Re: [PATCH resend 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-11-09 Thread महेश बंडेवार

On Fri, Nov 10, 2017 at 2:30 AM, Serge E. Hallyn  wrote:
> Quoting Mahesh Bandewar (mah...@bandewar.net):
>> From: Mahesh Bandewar 
>>
>> Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
>
> I understand the arguments in favor of whitelists in most cases for
> security purposes.  But given that you've said the goal here is to
> prevent use of a capability in a user namespace when a CVE has been
> found, a whitelist seems the wrong choice, since
>
> 1. it means that an attacker may through some other means be able
> to add a capability back into the whitelist when you specifically
> wanted to drop it.  With a blacklist, you could say "once a cap has
> been dropped it can never be re-added without rebooting".
> 2. it means by default all capabilities will be denied once the
> switch is pulled which is specifically not what you want in this
> case.
> 3. the admin can't just say "drop CAP_NET_ADMIN", but needs to
> know to echo ~CAP_NET_ADMIN.
>
> Why not make it a blacklist, and once a cap is dropped it can
> never be re-added?
>
Well, I'm not going to deny that blacklist approach would work equally
well but code becomes little simpler when you use the whitelist
approach. especially less complicated when a new capability needs to
be added (not that we add capabilities very often) but that would be
something one would have to pay attention to. However with this
approach I can just use the CAP_FULL_SET which is readily available.

Having said that I specifically don't have strong preference in this
regard (whitelist vs. blacklist).

> -serge

RE: [Intel-wired-lan] [PATCH] i40e: remove redundant initialization of read_size

2017-11-09 Thread Brown, Aaron F

> From: Brown, Aaron F
> Sent: Thursday, November 9, 2017 7:16 PM
> To: 'Colin King' ; Kirsher, Jeffrey T
> ; intel-wired-...@lists.osuosl.org;
> netdev@vger.kernel.org
> Cc: kernel-janit...@vger.kernel.org; linux-ker...@vger.kernel.org
> Subject: RE: [Intel-wired-lan] [PATCH] i40e: remove redundant initialization
> of read_size
> 
> > From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On
> Behalf
> > Of Colin King
> > Sent: Sunday, November 5, 2017 5:04 AM
> > To: Kirsher, Jeffrey T ; intel-wired-
> > l...@lists.osuosl.org; netdev@vger.kernel.org
> > Cc: kernel-janit...@vger.kernel.org; linux-ker...@vger.kernel.org
> > Subject: [Intel-wired-lan] [PATCH] i40e: remove redundant initialization of
> > read_size
> >
> > From: Colin Ian King 
> >
> > Variable read_size is initialized and this value is never read, it is
> > instead set inside the do-loop, hence the intialization is redundant
> > and can be removed. Cleans up clang warning:
> >
> > drivers/net/ethernet/intel/i40e/i40e_nvm.c:390:6: warning: Value stored
> > to 'read_size' during its initialization is never read
> >
> > Signed-off-by: Colin Ian King 
> 
> s/intialization/initialization/g

Other than that typo, looks fine:
Tested-by: Aaron Brown

Re: [PATCH resend 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-11-09 Thread महेश बंडेवार

On Fri, Nov 10, 2017 at 2:22 AM, Serge E. Hallyn  wrote:
> Quoting Mahesh Bandewar (mah...@bandewar.net):
>> From: Mahesh Bandewar 
>>
>> Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
>> takes input as capability mask expressed as two comma separated hex
>> u32 words. The mask, however, is stored in kernel as kernel_cap_t type.
>>
>> Any capabilities that are not part of this mask will be controlled and
>> will not be allowed to processes in controlled user-ns.
>>
>> Signed-off-by: Mahesh Bandewar 
>> ---
>>  Documentation/sysctl/kernel.txt | 21 ++
>>  include/linux/capability.h  |  3 +++
>>  kernel/capability.c | 47 
>> +
>>  kernel/sysctl.c |  5 +
>>  4 files changed, 76 insertions(+)
>>
>> diff --git a/Documentation/sysctl/kernel.txt 
>> b/Documentation/sysctl/kernel.txt
>> index 694968c7523c..a1d39dbae847 100644
>> --- a/Documentation/sysctl/kernel.txt
>> +++ b/Documentation/sysctl/kernel.txt
>> @@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
>>  - bootloader_version  [ X86 only ]
>>  - callhome[ S390 only ]
>>  - cap_last_cap
>> +- controlled_userns_caps_whitelist
>>  - core_pattern
>>  - core_pipe_limit
>>  - core_uses_pid
>> @@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
>>
>>  ==
>>
>> +controlled_userns_caps_whitelist
>> +
>> +Capability mask that is whitelisted for "controlled" user namespaces.
>> +Any capability that is missing from this mask will not be allowed to
>> +any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
>> +is not part of this mask, then processes running inside any controlled
>> +userns's will not be allowed to perform action that needs CAP_NET_RAW
>> +capability. However, processes that are attached to a parent user-ns
>> +hierarchy that is *not* controlled and has CAP_NET_RAW can continue
>> +performing those actions. User-namespaces are marked "controlled" at
>> +the time of their creation based on the capabilities of the creator.
>> +A process that does not have CAP_SYS_ADMIN will create user-namespaces
>> +that are controlled.
>
> Hm.  I think that's fine (the way 'controlled' user namespaces are
> defined), but that is design decision in itself, and should perhaps be
> discussed.
>
> Did you consider other ways?  What about using CAP_SETPCAP?
>
I did try other ways e.g. using another bounding-set etc. but
eventually settled with this approach because of main two properties -
(a) This has creation time settings which can be turned on/off at
runtime (b) the run-time knob actually controls the behavior which can
range from no-op to very-drastic without needing the applications to
change and controlled by admin. Also there are always more than one
ways of solving the problem and there possibly could be better
alternative and I don't deny that. :/

Controlling individual capabilities are going to give very different
experience. So how the behavior of the process going to be for a
specific capability is probably out-of-scope for this patch-set. I
would like to offload that responsibility to the admin, as he/she
would be the best judge and knowledgeable of the situation /
environment. This should be used as a tool to gain control.

>> +The value is expressed as two comma separated hex words (u32). This
>
> Why comma separated?  whitespace ok?  Leading 0x ok?  What is the
> default at boot?  (Obviously the patch tells me, I'm asking for it
> to be spelled out in the doc)
>
I tried multiple ways including representing capabilities in
string/name form for better readability but didn't want to add
additional complexities of dealing with strings and possible
string-related-issues for this. Also didn't want to reinvent the new
form so settled with something that is widely used (cpu
bounding/affinity/irq mapping etc.) and is capable of handling growing
bit set (currently 37 but possibly more later).

> Otherwise looks good, thanks!
>
> Serge
>
>> +sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
>> +are allowed to make changes.
>> +
>> +==
>> +
>>  core_pattern:
>>
>>  core_pattern is used to specify a core dumpfile pattern name.
>> diff --git a/include/linux/capability.h b/include/linux/capability.h
>> index b52e278e4744..6c0b9677c03f 100644
>> --- a/include/linux/capability.h
>> +++ b/include/linux/capability.h
>> @@ -13,6 +13,7 @@
>>  #define _LINUX_CAPABILITY_H
>>
>>  #include 
>> +#include 
>>
>>
>>  #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
>> @@ -247,6 +248,8 @@ extern bool ptracer_capable(struct task_struct *tsk, 
>> struct user_namespace *ns);
>>
>>  /* audit system wants to get cap info from files as well */
>>  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
>> cpu_vfs_cap_data

RE: [Intel-wired-lan] [PATCH] i40e: remove redundant initialization of read_size

2017-11-09 Thread Brown, Aaron F

> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On Behalf
> Of Colin King
> Sent: Sunday, November 5, 2017 5:04 AM
> To: Kirsher, Jeffrey T ; intel-wired-
> l...@lists.osuosl.org; netdev@vger.kernel.org
> Cc: kernel-janit...@vger.kernel.org; linux-ker...@vger.kernel.org
> Subject: [Intel-wired-lan] [PATCH] i40e: remove redundant initialization of
> read_size
> 
> From: Colin Ian King 
> 
> Variable read_size is initialized and this value is never read, it is
> instead set inside the do-loop, hence the intialization is redundant
> and can be removed. Cleans up clang warning:
> 
> drivers/net/ethernet/intel/i40e/i40e_nvm.c:390:6: warning: Value stored
> to 'read_size' during its initialization is never read
> 
> Signed-off-by: Colin Ian King 

s/intialization/initialization/g

Re: kernel BUG at net/key/af_key.c:LINE!

2017-11-09 Thread Herbert Xu

On Fri, Nov 10, 2017 at 01:30:38PM +1100, Herbert Xu wrote:
> 
> I found the problem.  This crap is coming from clone_policy.  Now
> let me find out where this code came from.

---8<---
Subject: xfrm: Copy policy family in clone_policy

The syzbot found an ancient bug in the IPsec code.  When we cloned
a socket policy (for example, for a child TCP socket derived from a
listening socket), we did not copy the family field.  This results
in a live policy with a zero family field.  This triggers a BUG_ON
check in the af_key code when the cloned policy is retrieved.

This patch fixes it by copying the family field over.

Reported-by: syzbot 
Signed-off-by: Herbert Xu 

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 8cafb3c..c238959 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1306,6 +1306,7 @@ static struct xfrm_policy *clone_policy(const struct 
xfrm_policy *old, int dir)
newp->xfrm_nr = old->xfrm_nr;
newp->index = old->index;
newp->type = old->type;
+   newp->family = old->family;
memcpy(newp->xfrm_vec, old->xfrm_vec,
   newp->xfrm_nr*sizeof(struct xfrm_tmpl));
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: [kernel-hardening] [PATCH v4] scripts: add leaking_addresses.pl

2017-11-09 Thread Kaiwan N Billimoria

>
> Yes, profiling and tracing are similar. And you need to be root to run
> the recording anyway. Thus, as long as root user can read kallsyms,
> trace-cmd should be fine. As trace-cmd requires root access to do any
> ftrace tracing.
>
> -- Steve
Got it, thanks..

[PATCH net-next] tcp: allow drivers to tweak TSQ logic

2017-11-09 Thread Eric Dumazet

From: Eric Dumazet 

I had many reports that TSQ logic breaks wifi aggregation.

Current logic is to allow up to 1 ms of bytes to be queued into qdisc
and drivers queues.

But Wifi aggregation needs a bigger budget to allow bigger rates to
be discovered by various TCP Congestion Controls algorithms.

This patch adds an extra socket field, allowing wifi drivers to select
another log scale to derive TCP Small Queue credit from current pacing
rate.

Initial value is 10, meaning that this patch does not change current
behavior.

We expect wifi drivers to set this field to smaller values (tests have
been done with values from 6 to 9)

They would have to use following template :

if (skb->sk && skb->sk->sk_pacing_shift != MY_PACING_SHIFT)
 skb->sk->sk_pacing_shift = MY_PACING_SHIFT;


Ref: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1670041
Signed-off-by: Eric Dumazet 
Cc: Johannes Berg 
Cc: Toke Høiland-Jørgensen 
Cc: Kir Kolyshkin 
---
 include/net/sock.h|1 +
 net/core/sock.c   |1 +
 net/ipv4/tcp_output.c |4 ++--
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 
688a823dccc306bd21f47da167c6922161af5a6a..fb0e5194a3bce61fac00fc234d2a5d1bb3c60f35
 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -451,6 +451,7 @@ struct sock {
kmemcheck_bitfield_end(flags);
 
u16 sk_gso_max_segs;
+   u8  sk_pacing_shift;
unsigned long   sk_lingertime;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
diff --git a/net/core/sock.c b/net/core/sock.c
index 
c59bcf90d90536fedc7809e397f6bd414781b529..2811ff8322d4a5f68e3e745cf585564e1ec5d809
 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2746,6 +2746,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
sk->sk_max_pacing_rate = ~0U;
sk->sk_pacing_rate = ~0U;
+   sk->sk_pacing_shift = 10;
sk->sk_incoming_cpu = -1;
/*
 * Before updating sk_refcnt, we must commit prior changes to memory
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 
9b98d35aa0d8d0a829e4a41985d805d4e2895a8e..fa5e7b81b5ec12039b1347474f5183b1d9c87887
 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1737,7 +1737,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int 
mss_now,
 {
u32 bytes, segs;
 
-   bytes = min(sk->sk_pacing_rate >> 10,
+   bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
 
/* Goal is to send at least one packet per ms,
@@ -2215,7 +2215,7 @@ static bool tcp_small_queue_check(struct sock *sk, const 
struct sk_buff *skb,
 {
unsigned int limit;
 
-   limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+   limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 
sk->sk_pacing_shift);
limit = min_t(u32, limit,
  sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;

Re: rsi: rsi_91x_ps: remove redundant code in str_psstate

2017-11-09 Thread Kalle Valo

"Gustavo A. R. Silva"  wrote:

> "INVALID_STATE" is already being returned in the default case and this
> code cannot be reached.
> 
> Addresses-Coverity-ID: 1398384
> Signed-off-by: Gustavo A. R. Silva 

Patch applied to wireless-drivers-next.git, thanks.

4775ae7afec6 rsi: rsi_91x_ps: remove redundant code in str_psstate

-- 
https://patchwork.kernel.org/patch/10044571/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

Re: rt2x00: use monotonic timestamps for frame dump

2017-11-09 Thread Kalle Valo

Arnd Bergmann  wrote:

> rt2x00 uses the deprecated do_gettimeofday() function to get a timestamp
> for its debugfs "dump" file interface.
> 
> The timestamp is using an unsigned 32-bit value, so we could make it
> work until 2106 by using ktime_get_real_ts64(), but it seems better to
> use monotonic times, as we normally want for timestamps.
> 
> Since this is an interface change, I'm incrementing the
> DUMP_HEADER_VERSION number, so user space can figure out whether the
> timestamps are monotonic or not. Most likely the tools won't care either
> way.
> 
> Generally speaking, ABI version numbers and in particular changing them
> is a bad idea. However since this is in debugfs, we don't put any
> API stability rules on the interface according to
> Documentation/filesystems/debugfs.txt, and we can take the easy way
> out here; anyone using the frame dump feature can probably work out
> the differences here.
> 
> Signed-off-by: Arnd Bergmann 

Patch applied to wireless-drivers-next.git, thanks.

f87eba996bac rt2x00: use monotonic timestamps for frame dump

-- 
https://patchwork.kernel.org/patch/10043531/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

Re: kernel BUG at net/key/af_key.c:LINE!

2017-11-09 Thread Herbert Xu

On Fri, Nov 10, 2017 at 01:11:45PM +1100, Herbert Xu wrote:
>
> Oh and this is an important clue.  We have two policies with
> identical index values.  The index value is meant to be unique
> so clearly something funny is going on.

I found the problem.  This crap is coming from clone_policy.  Now
let me see where this code came from.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: [PATCH 0/4] make function arg and structures as const

2017-11-09 Thread J. Bruce Fields

On Tue, Oct 17, 2017 at 12:40:27PM -0400, Jeff Layton wrote:
> On Tue, 2017-10-17 at 18:14 +0200, Bhumika Goyal wrote:
> > Make the function argument as const. After this change, make
> > the cache_detail structures as const.
> > 
> > Bhumika Goyal (4):
> >   sunrpc: make the function arg as const
> >   NFS: make cache_detail structures const
> >   NFSD: make cache_detail structures const
> >   SUNRPC: make cache_detail structures const
> > 
> >  fs/nfs/dns_resolve.c  | 2 +-
> >  fs/nfsd/export.c  | 4 ++--
> >  fs/nfsd/nfs4idmap.c   | 4 ++--
> >  include/linux/sunrpc/cache.h  | 2 +-
> >  net/sunrpc/auth_gss/svcauth_gss.c | 4 ++--
> >  net/sunrpc/cache.c| 2 +-
> >  net/sunrpc/svcauth_unix.c | 4 ++--
> >  7 files changed, 11 insertions(+), 11 deletions(-)
> > 
> 
> Looks pretty straightforward. You can add this to the set:
> 
> Reviewed-by: Jeff Layton 

Thanks, I've applied 1, 3, and 4 and could take #2 as well if it's OK
with Trond/Anna.

--b.

Re: kernel BUG at net/key/af_key.c:LINE!

2017-11-09 Thread Herbert Xu

On Fri, Nov 10, 2017 at 01:04:59PM +1100, Herbert Xu wrote:
> 
> By castrating the reproducer to not perform a pfkey dump I have
> captured the corrupted policy via xfrm:
> 
> src ???/0 dst ???/0 uid 0
> socket in action allow index 2083 priority 0 ptype main share any 
> flag  (0x)
> lifetime config:
>   limit: soft 0(bytes), hard 0(bytes)
>   limit: soft 0(packets), hard 0(packets)
>   expire add: soft 0(sec), hard 0(sec)
>   expire use: soft 0(sec), hard 0(sec)
> lifetime current:
>   0(bytes), 0(packets)
>   add 2017-11-10 09:58:17 use 2017-11-10 09:58:20
> tmpl src ac14:bb:: dst ::
> proto 0 spi 0x(0) reqid 0(0x) mode transport
> level 5 share any 
> enc-mask  auth-mask  comp-mask 
> 
> For comparison here is a good policy that was also created by the
> reproducer:
> 
> src fe80::bb/0 dst ::/0 uid 0
> socket in action allow index 2083 priority 0 ptype main share any 
> flag  (0x)
> lifetime config:
>   limit: soft 0(bytes), hard 0(bytes)
>   limit: soft 0(packets), hard 0(packets)
>   expire add: soft 0(sec), hard 0(sec)
>   expire use: soft 0(sec), hard 0(sec)
> lifetime current:
>   0(bytes), 0(packets)
>   add 2017-11-10 09:58:17 use 2017-11-10 09:58:17
> tmpl src ac14:bb:: dst ::
> proto 0 spi 0x(0) reqid 0(0x) mode transport
> level 5 share any 
> enc-mask  auth-mask  comp-mask 

Oh and this is an important clue.  We have two policies with
identical index values.  The index value is meant to be unique
so clearly something funny is going on.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: kernel BUG at net/key/af_key.c:LINE!

2017-11-09 Thread Herbert Xu

On Thu, Nov 09, 2017 at 10:38:57PM +1100, Herbert Xu wrote:
> 
> The xfrm code path is meant to forbid the creation of such a policy.
> I don't currently see how this is bypassing that check.  But
> clearly it has found a way through the check since it's crashing.

By castrating the reproducer to not perform a pfkey dump I have
captured the corrupted policy via xfrm:

src ???/0 dst ???/0 uid 0
socket in action allow index 2083 priority 0 ptype main share any flag  
(0x)
lifetime config:
  limit: soft 0(bytes), hard 0(bytes)
  limit: soft 0(packets), hard 0(packets)
  expire add: soft 0(sec), hard 0(sec)
  expire use: soft 0(sec), hard 0(sec)
lifetime current:
  0(bytes), 0(packets)
  add 2017-11-10 09:58:17 use 2017-11-10 09:58:20
tmpl src ac14:bb:: dst ::
proto 0 spi 0x(0) reqid 0(0x) mode transport
level 5 share any 
enc-mask  auth-mask  comp-mask 

For comparison here is a good policy that was also created by the
reproducer:

src fe80::bb/0 dst ::/0 uid 0
socket in action allow index 2083 priority 0 ptype main share any flag  
(0x)
lifetime config:
  limit: soft 0(bytes), hard 0(bytes)
  limit: soft 0(packets), hard 0(packets)
  expire add: soft 0(sec), hard 0(sec)
  expire use: soft 0(sec), hard 0(sec)
lifetime current:
  0(bytes), 0(packets)
  add 2017-11-10 09:58:17 use 2017-11-10 09:58:17
tmpl src ac14:bb:: dst ::
proto 0 spi 0x(0) reqid 0(0x) mode transport
level 5 share any 
enc-mask  auth-mask  comp-mask 

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: [PATCH v4 net-next 0/6] IGMP snooping for local traffic

2017-11-09 Thread Nikolay Aleksandrov

On 10.11.2017 00:10, Andrew Lunn wrote:
> The linux bridge supports IGMP snooping. It will listen to IGMP
> reports on bridge ports and keep track of which groups have been
> joined on an interface. It will then forward multicast based on this
> group membership.
> 
> When the bridge adds or removes groups from an interface, it uses
> switchdev to request the hardware add an mdb to a port, so the
> hardware can perform the selective forwarding between ports.
> 
> What is not covered by the current bridge code, is IGMP joins/leaves
> from the host on the brX interface. These are not reported via
> switchdev so that hardware knows the local host is interested in the
> multicast frames.
> 
> Luckily, the bridge does track joins/leaves on the brX interface. The
> code is obfuscated, which is why I missed it with my first attempt.
> So the first patch tries to remove this obfuscation. Currently,
> there is no notifications sent when the bridge interface joins a
> group. The second patch adds them. bridge monitor then shows
> joins/leaves in the same way as for other ports of the bridge.
> 
> Then starts the work passing down to the hardware that the host has
> joined/left a group. The existing switchdev mdb object cannot be used,
> since the semantics are different. The existing
> SWITCHDEV_OBJ_ID_PORT_MDB is used to indicate a specific multicast
> group should be forwarded out that port of the switch. However here we
> require the exact opposite. We want multicast frames for the group
> received on the port to be forwarded to the host. Hence add a new
> object SWITCHDEV_OBJ_ID_HOST_MDB, a multicast database entry to
> forward to the host. This new object is then propagated through the
> DSA layers. No DSA driver changes should be needed, this should just
> work...
> 
> This version fixes up the nitpick from Nikolay, removes an unrelated
> white space change, and adds in a patch adding a few const attributes
> to a couple of functions taking a port parameter, in order to stop the
> following patch from producing warnings.
> 
> Acked-by: Stephen Hemminger 
> 
> Andrew Lunn (6):
>   net: bridge: Rename mglist to host_joined
>   net: bridge: Send notification when host join/leaves a group
>   net: bridge: Add/del switchdev object on host join/leave
>   net: dsa: slave: Handle switchdev host mdb add/del
>   net: dsa: add more const attributes
>   net: dsa: switch: Don't add CPU port to an mdb by default
> 
>  include/net/switchdev.h   |  1 +
>  net/bridge/br_input.c |  2 +-
>  net/bridge/br_mdb.c   | 54 
> +++
>  net/bridge/br_multicast.c | 18 ++--
>  net/bridge/br_private.h   |  2 +-
>  net/dsa/dsa_priv.h|  4 ++--
>  net/dsa/port.c|  6 +++---
>  net/dsa/slave.c   | 13 
>  net/dsa/switch.c  |  2 +-
>  net/switchdev/switchdev.c |  2 ++
>  10 files changed, 85 insertions(+), 19 deletions(-)
> 

Andrew, overall looks good to me, thanks for keeping the acks and
incorporating the changes. Just one note - in the future please add the
reviewers to the CC list. I've been reviewing and suggesting changes to
this set since its RFC/WIP version but it's making it harder to track
when I'm not receiving the new versions and have to search for them on
netdev.

Thanks,
 Nik

Re: [PATCH net-next v2 02/15] bpf: offload: add infrastructure for loading programs for a specific netdev

2017-11-09 Thread Jakub Kicinski

Hi!

Sorry for the delay!

On Mon, 06 Nov 2017 18:32:45 +0100, Daniel Borkmann wrote:
> On 11/03/2017 09:56 PM, Jakub Kicinski wrote:
> > @@ -1549,6 +1555,8 @@ static void bpf_prog_free_deferred(struct work_struct 
> > *work)
> > struct bpf_prog_aux *aux;
> >
> > aux = container_of(work, struct bpf_prog_aux, work);
> > +   if (bpf_prog_is_dev_bound(aux))
> > +   bpf_prog_offload_destroy(aux->prog);
> > bpf_jit_free(aux->prog);
> >   }  
> [...]
> > +static int bpf_offload_notification(struct notifier_block *notifier,
> > +   ulong event, void *ptr)
> > +{
> > +   struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
> > +   struct bpf_dev_offload *offload, *tmp;
> > +
> > +   ASSERT_RTNL();
> > +
> > +   switch (event) {
> > +   case NETDEV_UNREGISTER:
> > +   list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
> > +offloads) {
> > +   if (offload->netdev == netdev)
> > +   __bpf_prog_offload_destroy(offload->prog);  
> 
> We would be calling this twice, right? Once here and then on prog
> destruction again. __bpf_prog_offload_destroy() looks like it will handle
> this just fine, but we should probably add a comment to
> __bpf_prog_offload_destroy() such that when changes are made to it
> it's obvious that we need to be extra careful.

Good point, I will add the comment.

> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index 323be2473c4b..1574b9f0f24e 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -824,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, 
> > struct bpf_prog *prog)
> > if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
> > return -EINVAL;
> >
> > -   prog->aux->ops = bpf_prog_types[type];
> > +   if (!bpf_prog_is_dev_bound(prog->aux))
> > +   prog->aux->ops = bpf_prog_types[type];
> > +   else
> > +   prog->aux->ops = &bpf_offload_prog_ops;
> > prog->type = type;
> > return 0;
> >   }
> > @@ -1054,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct 
> > bpf_prog *prog)
> >   }
> >   EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
> >
> > -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
> > +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type 
> > *attach_type)
> >   {
> > struct fd f = fdget(ufd);
> > struct bpf_prog *prog;
> > @@ -1062,7 +1065,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum 
> > bpf_prog_type *type)
> > prog = bpf_prog_get(f);
> > if (IS_ERR(prog))
> > return prog;
> > -   if (type && prog->type != *type) {
> > +   if (attach_type && (prog->type != *attach_type || prog->aux->offload)) {
> > prog = ERR_PTR(-EINVAL);
> > goto out;
> > }
> > @@ -1089,7 +1092,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum 
> > bpf_prog_type type)
> >   EXPORT_SYMBOL_GPL(bpf_prog_get_type);
> >
> >   /* last field in 'union bpf_attr' used by this command */
> > -#defineBPF_PROG_LOAD_LAST_FIELD prog_name
> > +#defineBPF_PROG_LOAD_LAST_FIELD prog_target_ifindex  
> 
> For program types that are neither XDP nor cls_bpf, we should reject
> the request if something calls bpf(2) with non-0 prog_target_ifindex.
> 
> That way, i) we don't burn the whole field and could perhaps reuse/union
> it for other prog types like tracing in future. Probably makes sense to
> do anyway since ii) for types like tracing, we would want to reject this
> upfront here and not when later attach happens.
> 
> I probably missed something when reading the code, but if I spotted
> that correctly, we might otherwise even go and nfp-jit simple progs
> for non-networking types (we would bail out later though on in
> __bpf_prog_get() ... but we shouldn't let syscall return in first
> place)?

Agreed, I will fix this.

Re: [PATCH resend 2/2] userns: control capabilities of some user namespaces

2017-11-09 Thread महेश बंडेवार

On Fri, Nov 10, 2017 at 2:25 AM, Serge E. Hallyn  wrote:
> Quoting Mahesh Bandewar (mah...@bandewar.net):
>> From: Mahesh Bandewar 
>>
>> With this new notion of "controlled" user-namespaces, the controlled
>> user-namespaces are marked at the time of their creation while the
>> capabilities of processes that belong to them are controlled using the
>> global mask.
>>
>> Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
>> that belongs to uncontrolled user-ns can create another (child) user-
>> namespace that is uncontrolled. Any other process (that either does
>> not have SYS_ADMIN or belongs to a controlled user-ns) can only
>> create a user-ns that is controlled.
>>
>> global-capability-whitelist (controlled_userns_caps_whitelist) is used
>> at the capability check-time and keeps the semantics for the processes
>> that belong to uncontrolled user-ns as it is. Processes that belong to
>> controlled user-ns however are subjected to different checks-
>>
>>(a) if the capability in question is controlled and process belongs
>>to controlled user-ns, then it's always denied.
>>(b) if the capability in question is NOT controlled then fall back
>>to the traditional check.
>>
>> Signed-off-by: Mahesh Bandewar 
>> ---
>>  include/linux/capability.h |  1 +
>>  include/linux/user_namespace.h | 20 
>>  kernel/capability.c|  5 +
>>  kernel/user_namespace.c|  3 +++
>>  security/commoncap.c   |  8 
>>  5 files changed, 37 insertions(+)
>>
>> diff --git a/include/linux/capability.h b/include/linux/capability.h
>> index 6c0b9677c03f..b8c6cac18658 100644
>> --- a/include/linux/capability.h
>> +++ b/include/linux/capability.h
>> @@ -250,6 +250,7 @@ extern bool ptracer_capable(struct task_struct *tsk, 
>> struct user_namespace *ns);
>>  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
>> cpu_vfs_cap_data *cpu_caps);
>>  int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
>>void __user *buff, size_t *lenp, loff_t 
>> *ppos);
>> +bool is_capability_controlled(int cap);
>>
>>  extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
>> size);
>>
>> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
>> index c18e01252346..e890fe81b47e 100644
>> --- a/include/linux/user_namespace.h
>> +++ b/include/linux/user_namespace.h
>> @@ -22,6 +22,7 @@ struct uid_gid_map {/* 64 bytes -- 1 cache line */
>>  };
>>
>>  #define USERNS_SETGROUPS_ALLOWED 1UL
>> +#define USERNS_CONTROLLED 2UL
>>
>>  #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
>>
>> @@ -102,6 +103,16 @@ static inline void put_user_ns(struct user_namespace 
>> *ns)
>>   __put_user_ns(ns);
>>  }
>>
>> +static inline bool is_user_ns_controlled(const struct user_namespace *ns)
>> +{
>> + return ns->flags & USERNS_CONTROLLED;
>> +}
>> +
>> +static inline void mark_user_ns_controlled(struct user_namespace *ns)
>> +{
>> + ns->flags |= USERNS_CONTROLLED;
>> +}
>> +
>>  struct seq_operations;
>>  extern const struct seq_operations proc_uid_seq_operations;
>>  extern const struct seq_operations proc_gid_seq_operations;
>> @@ -160,6 +171,15 @@ static inline struct ns_common *ns_get_owner(struct 
>> ns_common *ns)
>>  {
>>   return ERR_PTR(-EPERM);
>>  }
>> +
>> +static inline bool is_user_ns_controlled(const struct user_namespace *ns)
>> +{
>> + return false;
>> +}
>> +
>> +static inline void mark_user_ns_controlled(struct user_namespace *ns)
>> +{
>> +}
>>  #endif
>>
>>  #endif /* _LINUX_USER_H */
>> diff --git a/kernel/capability.c b/kernel/capability.c
>> index 62dbe3350c1b..40a38cc4ff43 100644
>> --- a/kernel/capability.c
>> +++ b/kernel/capability.c
>> @@ -510,6 +510,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
>> user_namespace *ns)
>>  }
>>
>>  /* Controlled-userns capabilities routines */
>> +bool is_capability_controlled(int cap)
>> +{
>> + return !cap_raised(controlled_userns_caps_whitelist, cap);
>> +}
>> +
>>  #ifdef CONFIG_SYSCTL
>>  int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
>>void __user *buff, size_t *lenp, loff_t *ppos)
>> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
>> index c490f1e4313b..f393ea5108f0 100644
>> --- a/kernel/user_namespace.c
>> +++ b/kernel/user_namespace.c
>> @@ -53,6 +53,9 @@ static void set_cred_user_ns(struct cred *cred, struct 
>> user_namespace *user_ns)
>>   cred->cap_effective = CAP_FULL_SET;
>>   cred->cap_ambient = CAP_EMPTY_SET;
>>   cred->cap_bset = CAP_FULL_SET;
>> + if (!ns_capable(user_ns->parent, CAP_SYS_ADMIN) ||
>> + is_user_ns_controlled(user_ns->parent))
>> + mark_user_ns_controlled(user_ns);
>
> Hm, why do this here, rather than at create_user_ns()? It
> shouldn't be recalculated when

Re: [Patch net] vlan: fix a use-after-free in vlan_device_event()

2017-11-09 Thread Girish Moodalbail


On 11/9/17 4:43 PM, Cong Wang wrote:

After refcnt reaches zero, vlan_vid_del() could free
dev->vlan_info via RCU:

RCU_INIT_POINTER(dev->vlan_info, NULL);
call_rcu(&vlan_info->rcu, vlan_info_rcu_free);

However, the pointer 'grp' still points to that memory
since it is set before vlan_vid_del():

 vlan_info = rtnl_dereference(dev->vlan_info);
 if (!vlan_info)
 goto out;
 grp = &vlan_info->grp;

Depending on when that RCU callback is scheduled, we could
trigger a use-after-free in vlan_group_for_each_dev()
right following this vlan_vid_del().

Fix it by moving vlan_vid_del() before setting grp. This
is also symmetric to the vlan_vid_add() we call in
vlan_device_event().

Reported-by: Fengguang Wu 
Fixes: efc73f4bbc23 ("net: Fix memory leak - vlan_info struct")
Cc: Alexander Duyck 
Cc: Linus Torvalds 
Cc: Girish Moodalbail 
Signed-off-by: Cong Wang 


LGTM.

Reviewed-by: Girish Moodalbail 

Thanks,
~Girish



---
  net/8021q/vlan.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 9649579b5b9f..4a72ee4e2ae9 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -376,6 +376,9 @@ static int vlan_device_event(struct notifier_block *unused, 
unsigned long event,
dev->name);
vlan_vid_add(dev, htons(ETH_P_8021Q), 0);
}
+   if (event == NETDEV_DOWN &&
+   (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
+   vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
  
  	vlan_info = rtnl_dereference(dev->vlan_info);

if (!vlan_info)
@@ -423,9 +426,6 @@ static int vlan_device_event(struct notifier_block *unused, 
unsigned long event,
struct net_device *tmp;
LIST_HEAD(close_list);
  
-		if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)

-   vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
-
/* Put all VLANs for this dev in the down state too.  */
vlan_group_for_each_dev(grp, i, vlandev) {
flgs = vlandev->flags;

Re: [PATCH] tcp: Export to userspace the TCP state names for the trace events

2017-11-09 Thread Song Liu


> On Nov 9, 2017, at 4:57 PM, Steven Rostedt  wrote:
> 
> 
> From: "Steven Rostedt (VMware)" 
> 
> The TCP trace events (specifically tcp_set_state) map enums to symbol
> names via __print_symbolic(). But this only works for reading trace events
> from the tracefs trace files. If perf or trace-cmd were to record these
> events, the event format file does not convert the enum names into numbers,
> and you get something like:
> 
> __print_symbolic(REC->oldstate,
>{ TCP_ESTABLISHED, "TCP_ESTABLISHED" },
>{ TCP_SYN_SENT, "TCP_SYN_SENT" },
>{ TCP_SYN_RECV, "TCP_SYN_RECV" },
>{ TCP_FIN_WAIT1, "TCP_FIN_WAIT1" },
>{ TCP_FIN_WAIT2, "TCP_FIN_WAIT2" },
>{ TCP_TIME_WAIT, "TCP_TIME_WAIT" },
>{ TCP_CLOSE, "TCP_CLOSE" },
>{ TCP_CLOSE_WAIT, "TCP_CLOSE_WAIT" },
>{ TCP_LAST_ACK, "TCP_LAST_ACK" },
>{ TCP_LISTEN, "TCP_LISTEN" },
>{ TCP_CLOSING, "TCP_CLOSING" },
>{ TCP_NEW_SYN_RECV, "TCP_NEW_SYN_RECV" })
> 
> Where trace-cmd and perf do not know the values of those enums.
> 
> Use the TRACE_DEFINE_ENUM() macros that will have the trace events convert
> the enum strings into their values at system boot. This will allow perf and
> trace-cmd to see actual numbers and not enums:
> 
> __print_symbolic(REC->oldstate,
>{ 1, "TCP_ESTABLISHED" },
>{ 2, "TCP_SYN_SENT" },
>{ 3, "TCP_SYN_RECV" },
>{ 4, "TCP_FIN_WAIT1" },
>{ 5, "TCP_FIN_WAIT2" },
>{ 6, "TCP_TIME_WAIT" },
>{ 7, "TCP_CLOSE" },
>{ 8, "TCP_CLOSE_WAIT" },
>{ 9, "TCP_LAST_ACK" },
>{ 10, "TCP_LISTEN" },
>{ 11, "TCP_CLOSING" },
>{ 12, "TCP_NEW_SYN_RECV" })
> 
> Signed-off-by: Steven Rostedt (VMware) 
> ---
> include/trace/events/tcp.h | 41 -
> 1 file changed, 28 insertions(+), 13 deletions(-)
> 
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index 07a6cbf1..62e5bad7901f 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -9,21 +9,36 @@
> #include 
> #include 
> 
> +#define tcp_state_names  \
> + EM(TCP_ESTABLISHED) \
> + EM(TCP_SYN_SENT)\
> + EM(TCP_SYN_RECV)\
> + EM(TCP_FIN_WAIT1)   \
> + EM(TCP_FIN_WAIT2)   \
> + EM(TCP_TIME_WAIT)   \
> + EM(TCP_CLOSE)   \
> + EM(TCP_CLOSE_WAIT)  \
> + EM(TCP_LAST_ACK)\
> + EM(TCP_LISTEN)  \
> + EM(TCP_CLOSING) \
> + EMe(TCP_NEW_SYN_RECV)
> +
> +/* enums need to be exported to user space */
> +#undef EM
> +#undef EMe
> +#define EM(a) TRACE_DEFINE_ENUM(a);
> +#define EMe(a)TRACE_DEFINE_ENUM(a);
> +
> +tcp_state_names
> +
> +#undef EM
> +#undef EMe
> +#define EM(a) tcp_state_name(a),
> +#define EMe(a)tcp_state_name(a)
> +
> #define tcp_state_name(state) { state, #state }
> #define show_tcp_state_name(val)  \
> - __print_symbolic(val,   \
> - tcp_state_name(TCP_ESTABLISHED),\
> - tcp_state_name(TCP_SYN_SENT),   \
> - tcp_state_name(TCP_SYN_RECV),   \
> - tcp_state_name(TCP_FIN_WAIT1),  \
> - tcp_state_name(TCP_FIN_WAIT2),  \
> - tcp_state_name(TCP_TIME_WAIT),  \
> - tcp_state_name(TCP_CLOSE),  \
> - tcp_state_name(TCP_CLOSE_WAIT), \
> - tcp_state_name(TCP_LAST_ACK),   \
> - tcp_state_name(TCP_LISTEN), \
> - tcp_state_name(TCP_CLOSING),\
> - tcp_state_name(TCP_NEW_SYN_RECV))
> + __print_symbolic(val, tcp_state_names)
> 
> /*
>  * tcp event with arguments sk and skb
> -- 
> 2.13.6
> 

Reviewed-and-tested-by: Song Liu

[PATCH net-next] tcp: Namespace-ify sysctl_tcp_default_congestion_control

2017-11-09 Thread Stephen Hemminger

Make the default TCP congestion control a per-namespace
value. The congestion control setting of new namespaces is inherited
from the root namespace. Modules are only autoloaded in the root namespace.

Signed-off-by: Stephen Hemminger 
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h  |  6 ++--
 net/ipv4/fib_semantics.c   |  4 +--
 net/ipv4/sysctl_net_ipv4.c | 19 ++-
 net/ipv4/tcp_cong.c| 78 ++
 net/ipv4/tcp_ipv4.c|  9 ++
 net/ipv6/route.c   |  3 +-
 7 files changed, 65 insertions(+), 55 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 379550f8124a..23ddfcfc8afe 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -155,6 +155,7 @@ struct netns_ipv4 {
int sysctl_tcp_invalid_ratelimit;
int sysctl_tcp_pacing_ss_ratio;
int sysctl_tcp_pacing_ca_ratio;
+   const struct tcp_congestion_ops __rcu  *tcp_congestion_control;
struct inet_timewait_death_row tcp_death_row;
int sysctl_max_syn_backlog;
int sysctl_tcp_fastopen;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index babfd4da1515..64d4099d41da 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1005,8 +1005,8 @@ void tcp_unregister_congestion_control(struct 
tcp_congestion_ops *type);
 void tcp_assign_congestion_control(struct sock *sk);
 void tcp_init_congestion_control(struct sock *sk);
 void tcp_cleanup_congestion_control(struct sock *sk);
-int tcp_set_default_congestion_control(const char *name);
-void tcp_get_default_congestion_control(char *name);
+int tcp_set_default_congestion_control(struct net *net, const char *name);
+void tcp_get_default_congestion_control(struct net *net, char *name);
 void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
@@ -1020,7 +1020,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 
acked);
 extern struct tcp_congestion_ops tcp_reno;
 
 struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
-u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca);
+u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
 #ifdef CONFIG_INET
 char *tcp_ca_get_name_by_key(u32 key, char *buffer);
 #else
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 589caaa90613..f04d944f8abe 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct 
fib_info *fi)
bool ecn_ca = false;
 
nla_strlcpy(tmp, nla, sizeof(tmp));
-   val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+   val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
} else {
val = nla_get_u32(nla);
}
@@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct 
fib_config *cfg)
char tmp[TCP_CA_NAME_MAX];
 
nla_strlcpy(tmp, nla, sizeof(tmp));
-   val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+   val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC)
return -EINVAL;
} else {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a82b44038308..c97d9e614017 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, 
int write,
 static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
   void __user *buffer, size_t *lenp, 
loff_t *ppos)
 {
+   struct net *net = container_of(ctl->data, struct net,
+  ipv4.tcp_congestion_control);
char val[TCP_CA_NAME_MAX];
struct ctl_table tbl = {
.data = val,
@@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table 
*ctl, int write,
};
int ret;
 
-   tcp_get_default_congestion_control(val);
+   tcp_get_default_congestion_control(net, val);
 
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
if (write && ret == 0)
-   ret = tcp_set_default_congestion_control(val);
+   ret = tcp_set_default_congestion_control(net, val);
return ret;
 }
 
@@ -463,12 +465,6 @@ static struct ctl_table ipv4_table[] = {
.mode   = 0644,
.proc_handler   = proc_dointvec
},
-   {
-   .procname   = "tcp_congestion_control",
-   .mode   = 0644,
-   .maxlen = TCP_CA_NAME_MAX,
-   .proc_handler   = proc_tcp_congestion_control,

Re: [PATCH net-next 1/3] net: bgmac: Pad packets to a minimum size

2017-11-09 Thread Florian Fainelli

On 11/09/2017 05:06 PM, Andrew Lunn wrote:
> On Thu, Nov 09, 2017 at 03:03:16PM -0800, Florian Fainelli wrote:
>> On 11/09/2017 02:37 PM, Andrew Lunn wrote:
>>> On Thu, Nov 09, 2017 at 02:26:04PM -0800, Florian Fainelli wrote:
>>>> In preparation for enabling Broadcom tags with b53, pad packets to a
>>>> minimum size of 64 bytes (sans FCS) in order for the Broadcom switch to
>>>> accept ingressing frames. Without this, we would typically be able to
>>>> DHCP, but not resolve with ARP because packets are too small and get
>>>> rejected by the switch.
>>>
>>> Hi Florian
>>>
>>> Is the MAC sending runt packets in its default configuration? Is this
>>> a general issue, and not just an issue when there is a switch directly
>>> attached?
>>
>> The MAC is sending 64 bytes (with FCS) padded packets by default, but
>> this apparently gets mis-calculated when Broadcom tags are enabled, such
>> that we need to pad before to avoid that.
> 
> Hi Florian
> 
> Ah, so maybe when the tag is stripped off it then becomes a runt
> packet and so gets dropped.

Yes, that is exactly what I observed.
-- 
Florian

Per-CPU Queueing for QoS

2017-11-09 Thread Michael Ma

Currently txq/qdisc selection is based on flow hash so packets from
the same flow will follow the order when they enter qdisc/txq, which
avoids out-of-order problem.

To improve the concurrency of QoS algorithm we plan to have multiple
per-cpu queues for a single TC class and do busy polling from a
per-class thread to drain these queues. If we can do this frequently
enough the out-of-order situation in this polling thread should not be
that bad.

To give more details - in the send path we introduce per-cpu per-class
queues so that packets from the same class and same core will be
enqueued to the same place. Then a per-class thread polls the queues
belonging to its class from all the cpus and aggregates them into
another per-class queue. This can effectively reduce contention but
inevitably introduces potential out-of-order issue.

Any concerns or suggestions about working in this direction?

Re: [PATCH net-next 1/3] net: bgmac: Pad packets to a minimum size

2017-11-09 Thread Andrew Lunn

On Thu, Nov 09, 2017 at 03:03:16PM -0800, Florian Fainelli wrote:
> On 11/09/2017 02:37 PM, Andrew Lunn wrote:
> > On Thu, Nov 09, 2017 at 02:26:04PM -0800, Florian Fainelli wrote:
> >> In preparation for enabling Broadcom tags with b53, pad packets to a
> >> minimum size of 64 bytes (sans FCS) in order for the Broadcom switch to
> >> accept ingressing frames. Without this, we would typically be able to
> >> DHCP, but not resolve with ARP because packets are too small and get
> >> rejected by the switch.
> > 
> > Hi Florian
> > 
> > Is the MAC sending runt packets in its default configuration? Is this
> > a general issue, and not just an issue when there is a switch directly
> > attached?
> 
> The MAC is sending 64 bytes (with FCS) padded packets by default, but
> this apparently gets mis-calculated when Broadcom tags are enabled, such
> that we need to pad before to avoid that.

Hi Florian

Ah, so maybe when the tag is stripped off it then becomes a runt
packet and so gets dropped.

   Andrew

[PATCH] tcp: Export to userspace the TCP state names for the trace events

2017-11-09 Thread Steven Rostedt


From: "Steven Rostedt (VMware)" 

The TCP trace events (specifically tcp_set_state) map enums to symbol
names via __print_symbolic(). But this only works for reading trace events
from the tracefs trace files. If perf or trace-cmd were to record these
events, the event format file does not convert the enum names into numbers,
and you get something like:

__print_symbolic(REC->oldstate,
{ TCP_ESTABLISHED, "TCP_ESTABLISHED" },
{ TCP_SYN_SENT, "TCP_SYN_SENT" },
{ TCP_SYN_RECV, "TCP_SYN_RECV" },
{ TCP_FIN_WAIT1, "TCP_FIN_WAIT1" },
{ TCP_FIN_WAIT2, "TCP_FIN_WAIT2" },
{ TCP_TIME_WAIT, "TCP_TIME_WAIT" },
{ TCP_CLOSE, "TCP_CLOSE" },
{ TCP_CLOSE_WAIT, "TCP_CLOSE_WAIT" },
{ TCP_LAST_ACK, "TCP_LAST_ACK" },
{ TCP_LISTEN, "TCP_LISTEN" },
{ TCP_CLOSING, "TCP_CLOSING" },
{ TCP_NEW_SYN_RECV, "TCP_NEW_SYN_RECV" })

Where trace-cmd and perf do not know the values of those enums.

Use the TRACE_DEFINE_ENUM() macros that will have the trace events convert
the enum strings into their values at system boot. This will allow perf and
trace-cmd to see actual numbers and not enums:

__print_symbolic(REC->oldstate,
{ 1, "TCP_ESTABLISHED" },
{ 2, "TCP_SYN_SENT" },
{ 3, "TCP_SYN_RECV" },
{ 4, "TCP_FIN_WAIT1" },
{ 5, "TCP_FIN_WAIT2" },
{ 6, "TCP_TIME_WAIT" },
{ 7, "TCP_CLOSE" },
{ 8, "TCP_CLOSE_WAIT" },
{ 9, "TCP_LAST_ACK" },
{ 10, "TCP_LISTEN" },
{ 11, "TCP_CLOSING" },
{ 12, "TCP_NEW_SYN_RECV" })

Signed-off-by: Steven Rostedt (VMware) 
---
 include/trace/events/tcp.h | 41 -
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 07a6cbf1..62e5bad7901f 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -9,21 +9,36 @@
 #include 
 #include 
 
+#define tcp_state_names\
+   EM(TCP_ESTABLISHED) \
+   EM(TCP_SYN_SENT)\
+   EM(TCP_SYN_RECV)\
+   EM(TCP_FIN_WAIT1)   \
+   EM(TCP_FIN_WAIT2)   \
+   EM(TCP_TIME_WAIT)   \
+   EM(TCP_CLOSE)   \
+   EM(TCP_CLOSE_WAIT)  \
+   EM(TCP_LAST_ACK)\
+   EM(TCP_LISTEN)  \
+   EM(TCP_CLOSING) \
+   EMe(TCP_NEW_SYN_RECV)
+
+/* enums need to be exported to user space */
+#undef EM
+#undef EMe
+#define EM(a) TRACE_DEFINE_ENUM(a);
+#define EMe(a)TRACE_DEFINE_ENUM(a);
+
+tcp_state_names
+
+#undef EM
+#undef EMe
+#define EM(a) tcp_state_name(a),
+#define EMe(a)tcp_state_name(a)
+
 #define tcp_state_name(state)  { state, #state }
 #define show_tcp_state_name(val)   \
-   __print_symbolic(val,   \
-   tcp_state_name(TCP_ESTABLISHED),\
-   tcp_state_name(TCP_SYN_SENT),   \
-   tcp_state_name(TCP_SYN_RECV),   \
-   tcp_state_name(TCP_FIN_WAIT1),  \
-   tcp_state_name(TCP_FIN_WAIT2),  \
-   tcp_state_name(TCP_TIME_WAIT),  \
-   tcp_state_name(TCP_CLOSE),  \
-   tcp_state_name(TCP_CLOSE_WAIT), \
-   tcp_state_name(TCP_LAST_ACK),   \
-   tcp_state_name(TCP_LISTEN), \
-   tcp_state_name(TCP_CLOSING),\
-   tcp_state_name(TCP_NEW_SYN_RECV))
+   __print_symbolic(val, tcp_state_names)
 
 /*
  * tcp event with arguments sk and skb
-- 
2.13.6

Re: [PATCH] ipvlan: fix ipv6 outbound device

2017-11-09 Thread liuqifa

That's a good question.
Recently, I have been researching how to use ipvlan. I found that in L3 mode,
packets from the ipvlan slave interface can only go out through the phy device
to which the slave interface is attached, and that this is a result of
assigning the flowi4's output device. So I tried not assigning the output
device, and found that packets from the ipvlan slave interface could then be
forwarded to another phy device, but this brought some other problems.
At the same time, when I read the code, I found that the processing of IPv6
packets is different, so I think this is a bug.
I look forward to hearing your advice!

---
On Thu, Nov 9, 2017 at 9:09 PM,   wrote:
> From: Keefe Liu 
>
> When processing an outbound IPv6 packet, we should assign the master
> device as the output device rather than the input device.
>
curious to know, how you discovered this?

> Signed-off-by: Keefe Liu 
Acked-by: Mahesh Bandewar 
> ---
>  drivers/net/ipvlan/ipvlan_core.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/ipvlan/ipvlan_core.c 
> b/drivers/net/ipvlan/ipvlan_core.c
> index 034ae4c..f2a7e92 100644
> --- a/drivers/net/ipvlan/ipvlan_core.c
> +++ b/drivers/net/ipvlan/ipvlan_core.c
> @@ -409,7 +409,7 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)
> struct dst_entry *dst;
> int err, ret = NET_XMIT_DROP;
> struct flowi6 fl6 = {
> -   .flowi6_iif = dev->ifindex,
> +   .flowi6_oif = dev->ifindex,
> .daddr = ip6h->daddr,
> .saddr = ip6h->saddr,
> .flowi6_flags = FLOWI_FLAG_ANYSRC,
> --
> 1.8.3.1
>
>

Re: [PATCH net-next] bpf: add support for SO_PRIORITY in bpf_getsockopt

2017-11-09 Thread Alexei Starovoitov


On 11/10/17 8:04 AM, Vlad Dumitrescu wrote:

From: Vlad Dumitrescu 

Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.

Signed-off-by: Vlad Dumitrescu 
---
 net/core/filter.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 1afa17935954..61c791f9f628 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3292,8 +3292,20 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, 
bpf_sock,
if (!sk_fullsock(sk))
goto err_clear;

+   if (level == SOL_SOCKET) {
+   if (optlen != sizeof(int))
+   goto err_clear;
+
+   switch (optname) {
+   case SO_PRIORITY:
+   *((int *)optval) = sk->sk_priority;


would be cleaner to add sk_priority to 'struct bpf_sock_ops' instead.
Faster runtime too.

Re: [PATCH] net/tcp: introduce TRACE_EVENT for TCP/IPv4 state transition

2017-11-09 Thread Steven Rostedt

On Thu, 9 Nov 2017 23:40:13 +
Song Liu  wrote:

> > tcp_set_state uses __print_symbolic to show state in text format. I found
> > trace-cmd cannot parse that part:
> > 
> > [011] 147338.660560: tcp_set_state:sport=16262 dport=48346 \
> >saddr=127.0.0.6 daddr=127.0.0.6 saddrv6=2401:db00:30:317e:face:0:1f:0 \
> >daddrv6=2401:db00:30:31e5:face:0:7f:0 oldstate= newstate=

The latest trace-cmd does show oldstate=0xa newstate=0x7, since I fixed
it so undefined symbols and flags are displayed.

> > 
> > Other parts of the output looks good to me.
> > 
> > Thanks,
> > Song  
> 
> I am not sure whether this is the best approach, but the following patch 
> fixes the output of perf:

No it's not the best approach. But the below patch is ;-)

> 
>  0.44%  sport=16262 dport=39362 saddr=127.0.0.6 daddr=127.0.0.6 \
> saddrv6=2401:db00:30:317e:face:0:1f:0 daddrv6=2401:db00:30:31e5:face:0:7f:0 \
> oldstate=TCP_CLOSE_WAIT newstate=TCP_LAST_ACK
> 

I'll send a formal patch if you all approve.

-- Steve

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 07a6cbf1..62e5bad7901f 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -9,21 +9,36 @@
 #include 
 #include 
 
+#define tcp_state_names\
+   EM(TCP_ESTABLISHED) \
+   EM(TCP_SYN_SENT)\
+   EM(TCP_SYN_RECV)\
+   EM(TCP_FIN_WAIT1)   \
+   EM(TCP_FIN_WAIT2)   \
+   EM(TCP_TIME_WAIT)   \
+   EM(TCP_CLOSE)   \
+   EM(TCP_CLOSE_WAIT)  \
+   EM(TCP_LAST_ACK)\
+   EM(TCP_LISTEN)  \
+   EM(TCP_CLOSING) \
+   EMe(TCP_NEW_SYN_RECV)
+
+/* enums need to be exported to user space */
+#undef EM
+#undef EMe
+#define EM(a) TRACE_DEFINE_ENUM(a);
+#define EMe(a)TRACE_DEFINE_ENUM(a);
+
+tcp_state_names
+
+#undef EM
+#undef EMe
+#define EM(a) tcp_state_name(a),
+#define EMe(a)tcp_state_name(a)
+
 #define tcp_state_name(state)  { state, #state }
 #define show_tcp_state_name(val)   \
-   __print_symbolic(val,   \
-   tcp_state_name(TCP_ESTABLISHED),\
-   tcp_state_name(TCP_SYN_SENT),   \
-   tcp_state_name(TCP_SYN_RECV),   \
-   tcp_state_name(TCP_FIN_WAIT1),  \
-   tcp_state_name(TCP_FIN_WAIT2),  \
-   tcp_state_name(TCP_TIME_WAIT),  \
-   tcp_state_name(TCP_CLOSE),  \
-   tcp_state_name(TCP_CLOSE_WAIT), \
-   tcp_state_name(TCP_LAST_ACK),   \
-   tcp_state_name(TCP_LISTEN), \
-   tcp_state_name(TCP_CLOSING),\
-   tcp_state_name(TCP_NEW_SYN_RECV))
+   __print_symbolic(val, tcp_state_names)
 
 /*
  * tcp event with arguments sk and skb

[Patch net] vlan: fix a use-after-free in vlan_device_event()

2017-11-09 Thread Cong Wang

After refcnt reaches zero, vlan_vid_del() could free
dev->vlan_info via RCU:

RCU_INIT_POINTER(dev->vlan_info, NULL);
call_rcu(&vlan_info->rcu, vlan_info_rcu_free);

However, the pointer 'grp' still points to that memory
since it is set before vlan_vid_del():

vlan_info = rtnl_dereference(dev->vlan_info);
if (!vlan_info)
goto out;
grp = &vlan_info->grp;

Depending on when that RCU callback is scheduled, we could
trigger a use-after-free in vlan_group_for_each_dev()
right after this vlan_vid_del().

Fix it by moving vlan_vid_del() before setting grp. This
is also symmetric to the vlan_vid_add() we call in
vlan_device_event().

Reported-by: Fengguang Wu 
Fixes: efc73f4bbc23 ("net: Fix memory leak - vlan_info struct")
Cc: Alexander Duyck 
Cc: Linus Torvalds 
Cc: Girish Moodalbail 
Signed-off-by: Cong Wang 
---
 net/8021q/vlan.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 9649579b5b9f..4a72ee4e2ae9 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -376,6 +376,9 @@ static int vlan_device_event(struct notifier_block *unused, 
unsigned long event,
dev->name);
vlan_vid_add(dev, htons(ETH_P_8021Q), 0);
}
+   if (event == NETDEV_DOWN &&
+   (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
+   vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
 
vlan_info = rtnl_dereference(dev->vlan_info);
if (!vlan_info)
@@ -423,9 +426,6 @@ static int vlan_device_event(struct notifier_block *unused, 
unsigned long event,
struct net_device *tmp;
LIST_HEAD(close_list);
 
-   if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)
-   vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
-
/* Put all VLANs for this dev in the down state too.  */
vlan_group_for_each_dev(grp, i, vlandev) {
flgs = vlandev->flags;
-- 
2.13.0

Re: [vlan_device_event] BUG: unable to handle kernel paging request at 6b6b6ccf

2017-11-09 Thread Cong Wang

On Thu, Nov 9, 2017 at 7:51 AM, Girish Moodalbail
 wrote:
>
> Upon receiving NETDEV_DOWN event, we are calling
>
> vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
>
> which in turn calls call_rcu() to queue vlan_info_free_rcu() to be called at
> some point. This free function frees the array[]
> (vlan_info.vlan_grp.vn_devices_array).  My guess is that
> vlan_info_free_rcu() is being called first and then the array[] is being
> accessed in vlan_device_event().
>

Well yes and no.

No, RCU itself is not broken and we clearly unpublish the RCU pointer
before calling call_rcu().

Yes, I see where it is broken: the grp pointer still points to old
dev->vlan_info, we should re-fetch it after vlan_vid_del().

I will send a fix.

Thanks!

Re: [Patch net 05/13] cls_cgroup: use tcf_exts_get_net() before call_rcu()

2017-11-09 Thread Cong Wang

On Thu, Nov 9, 2017 at 2:31 PM, Roman Mashak  wrote:
> Cong Wang  writes:
>> @@ -124,8 +130,10 @@ static int cls_cgroup_change(struct net *net, struct 
>> sk_buff *in_skb,
>>   goto errout;
>>
>>   rcu_assign_pointer(tp->root, new);
>> - if (head)
>> + if (head) {
>> + tcf_exts_get_net(>exts);
>>   call_rcu(>rcu, cls_cgroup_destroy_rcu);
>
> In this case why do you not need to care about success/failure of
> tcf_exts_get_net() ?

The answer is right in the changelog you omitted... Quoted below:

"Note, on ->destroy() path we have to respect the return value
of tcf_exts_get_net(), on other paths it should always return
true, so we don't need to care."

Re: [PATCH] net/tcp: introduce TRACE_EVENT for TCP/IPv4 state transition

2017-11-09 Thread Song Liu


> On Nov 9, 2017, at 10:34 AM, Song Liu  wrote:
> 
>> 
>> On Nov 9, 2017, at 10:18 AM, Steven Rostedt  wrote:
>> 
>> On Thu, 9 Nov 2017 15:43:29 +0900
>> Alexei Starovoitov  wrote:
>> 
 +TRACE_EVENT(tcp_set_state,
 +  TP_PROTO(struct sock *sk, int oldstate, int newstate),
 +  TP_ARGS(sk, oldstate, newstate),
 +
 +  TP_STRUCT__entry(
 +  __field(__be32, dst)
 +  __field(__be32, src)
 +  __field(__u16, dport)
 +  __field(__u16, sport)
 +  __field(int, oldstate)
 +  __field(int, newstate)
 +  ),
 +
 +  TP_fast_assign(
 +  if (oldstate == TCP_TIME_WAIT) {
 +  __entry->dst = inet_twsk(sk)->tw_daddr;
 +  __entry->src = inet_twsk(sk)->tw_rcv_saddr;
 +  __entry->dport = ntohs(inet_twsk(sk)->tw_dport);
 +  __entry->sport = ntohs(inet_twsk(sk)->tw_sport);
 +  } else if (oldstate == TCP_NEW_SYN_RECV) {
 +  __entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr;
 +  __entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr;
 +  __entry->dport =
 +  ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port);
 +  __entry->sport = inet_rsk(inet_reqsk(sk))->ir_num;
 +  } else {
 +  __entry->dst = inet_sk(sk)->inet_daddr;
 +  __entry->src = inet_sk(sk)->inet_rcv_saddr;
 +  __entry->dport = ntohs(inet_sk(sk)->inet_dport);
 +  __entry->sport = ntohs(inet_sk(sk)->inet_sport);
 +  }
 +
 +  __entry->oldstate = oldstate;
 +  __entry->newstate = newstate;
 +  ),
 +
 +  TP_printk("%08X:%04X %08X:%04X, %02x %02x",
 +  __entry->src, __entry->sport, __entry->dst, __entry->dport,
 +  __entry->oldstate, __entry->newstate)  
>>> 
>>> direct %x of state is not allowed.
>>> This has to use show_tcp_state_name() like it's done in trace_tcp_set_state
>> 
>> Hmm, I need to look at trace_tcp_set_state. I'm guessing it is in
>> net-next too?
> 
> Yes, in net-next. There are 6 tracepoints under tcp group:
> 
>  tcp_destroy_sock  
>  tcp_receive_reset  
>  tcp_retransmit_skb  
>  tcp_retransmit_synack  
>  tcp_send_reset  
>  tcp_set_state
> 
> They are all added recently.
> 
>> 
>>> 
>>> Also I'm missing the reason to introduce another tracepoint
>>> that looks just like trace_tcp_set_state.
>> 
>> I want to make sure that perf and trace-cmd can parse the TP_printk(),
>> if it is having helper functions like that in the TP_printk() part,
>> then the libtraceevent needs to be aware of that.
>> 
> 
> tcp_set_state uses __print_symbolic to show state in text format. I found
> trace-cmd cannot parse that part:
> 
> [011] 147338.660560: tcp_set_state:sport=16262 dport=48346 \
>saddr=127.0.0.6 daddr=127.0.0.6 saddrv6=2401:db00:30:317e:face:0:1f:0 \
>daddrv6=2401:db00:30:31e5:face:0:7f:0 oldstate= newstate=
> 
> Other parts of the output looks good to me.
> 
> Thanks,
> Song

I am not sure whether this is the best approach, but the following patch 
fixes the output of perf:

 0.44%  sport=16262 dport=39362 saddr=127.0.0.6 daddr=127.0.0.6 \
saddrv6=2401:db00:30:317e:face:0:1f:0 daddrv6=2401:db00:30:31e5:face:0:7f:0 \
oldstate=TCP_CLOSE_WAIT newstate=TCP_LAST_ACK

Thanks,
Song


>From 4b7e27631a4c7df96a38223a95ee3ede2f5f2d19 Mon Sep 17 00:00:00 2001
From: Song Liu 
Date: Thu, 9 Nov 2017 15:30:07 -0800
Subject: [PATCH] libtraceevent: add flags for tcp state names

Names of TCP states are added to flags in event-parse.c.

The names are used to print symbolic names in tracepoint:
tcp/tcp_set_state.

Signed-off-by: Song Liu 
---
 tools/lib/traceevent/event-parse.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/tools/lib/traceevent/event-parse.c 
b/tools/lib/traceevent/event-parse.c
index 7ce724f..4972dc2 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -3790,6 +3790,20 @@ static const struct flag flags[] = {

{ "HRTIMER_NORESTART", 0 },
{ "HRTIMER_RESTART", 1 },
+
+   /* tcp state names, see include/net/tcp_states.h */
+   { "TCP_ESTABLISHED", 1 },
+   { "TCP_SYN_SENT", 2 },
+   { "TCP_SYN_RECV", 3 },
+   { "TCP_FIN_WAIT1", 4 },
+   { "TCP_FIN_WAIT2", 5 },
+   { "TCP_TIME_WAIT", 6 },
+   { "TCP_CLOSE", 7 },
+   { "TCP_CLOSE_WAIT", 8 },
+   { "TCP_LAST_ACK", 9 },
+   { "TCP_LISTEN", 10 },
+   { "TCP_CLOSING", 11 },
+   { "TCP_NEW_SYN_RECV", 12 },
 };

 static long long eval_flag(const char *flag)
--
2.9.5

[PATCH v2 0/6] wl1251: Fix MAC address for Nokia N900

2017-11-09 Thread Pali Rohár

This patch series fixes processing of the MAC address for the wl1251 chip
found in the Nokia N900.

Changes since v1:
* Added Acked-by for Pavel Machek
* Fixed grammar
* Magic numbers for NVS offsets are replaced by defines
* Check for validity of mac address NVS data is moved into function
* Changed order of patches as Pavel requested

Pali Rohár (6):
  wl1251: Update wl->nvs_len after wl->nvs is valid
  wl1251: Generate random MAC address only if driver does not have
valid
  wl1251: Parse and use MAC address from supplied NVS data
  wl1251: Set generated MAC address back to NVS data
  firmware: Add request_firmware_prefer_user() function
  wl1251: Use request_firmware_prefer_user() for loading NVS
calibration data

 drivers/base/firmware_class.c  |   45 +-
 drivers/net/wireless/ti/wl1251/Kconfig |1 +
 drivers/net/wireless/ti/wl1251/main.c  |  104 ++--
 include/linux/firmware.h   |9 +++
 4 files changed, 138 insertions(+), 21 deletions(-)

-- 
1.7.9.5

[PATCH v2 2/6] wl1251: Generate random MAC address only if driver does not have valid

2017-11-09 Thread Pali Rohár

Before this patch, the driver generated a random MAC address every time it
was initialized. After that, the random MAC address could be overwritten
with a fixed one, if provided.

This patch changes the order: first it tries to read the fixed MAC address,
and if that fails, the driver generates a random MAC address.

Signed-off-by: Pali Rohár 
Acked-by: Pavel Machek 
---
 drivers/net/wireless/ti/wl1251/main.c |   27 ++-
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wireless/ti/wl1251/main.c 
b/drivers/net/wireless/ti/wl1251/main.c
index 8929bb3..9106c20 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -1492,7 +1492,24 @@ int wl1251_init_ieee80211(struct wl1251 *wl)
wl->hw->queues = 4;
 
if (wl->use_eeprom)
-   wl1251_read_eeprom_mac(wl);
+   ret = wl1251_read_eeprom_mac(wl);
+   else
+   ret = -EINVAL;
+
+   if (ret == 0 && !is_valid_ether_addr(wl->mac_addr))
+   ret = -EINVAL;
+
+   if (ret < 0) {
+   /*
+* In case our MAC address is not correctly set,
+* we use a random but Nokia MAC.
+*/
+   static const u8 nokia_oui[3] = {0x00, 0x1f, 0xdf};
+   memcpy(wl->mac_addr, nokia_oui, 3);
+   get_random_bytes(wl->mac_addr + 3, 3);
+   wl1251_warning("MAC address in eeprom or nvs data is not 
valid");
+   wl1251_warning("Setting random MAC address: %pM", wl->mac_addr);
+   }
 
ret = wl1251_register_hw(wl);
if (ret)
@@ -1513,7 +1530,6 @@ struct ieee80211_hw *wl1251_alloc_hw(void)
struct ieee80211_hw *hw;
struct wl1251 *wl;
int i;
-   static const u8 nokia_oui[3] = {0x00, 0x1f, 0xdf};
 
hw = ieee80211_alloc_hw(sizeof(*wl), _ops);
if (!hw) {
@@ -1563,13 +1579,6 @@ struct ieee80211_hw *wl1251_alloc_hw(void)
INIT_WORK(>irq_work, wl1251_irq_work);
INIT_WORK(>tx_work, wl1251_tx_work);
 
-   /*
-* In case our MAC address is not correctly set,
-* we use a random but Nokia MAC.
-*/
-   memcpy(wl->mac_addr, nokia_oui, 3);
-   get_random_bytes(wl->mac_addr + 3, 3);
-
wl->state = WL1251_STATE_OFF;
mutex_init(>mutex);
spin_lock_init(>wl_lock);
-- 
1.7.9.5

[PATCH v2 1/6] wl1251: Update wl->nvs_len after wl->nvs is valid

2017-11-09 Thread Pali Rohár

If kmemdup fails, then wl->nvs_len will contain invalid non-zero size.

Signed-off-by: Pali Rohár 
Acked-by: Pavel Machek 
---
 drivers/net/wireless/ti/wl1251/main.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ti/wl1251/main.c 
b/drivers/net/wireless/ti/wl1251/main.c
index 9915d83..8929bb3 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -122,8 +122,7 @@ static int wl1251_fetch_nvs(struct wl1251 *wl)
goto out;
}
 
-   wl->nvs_len = fw->size;
-   wl->nvs = kmemdup(fw->data, wl->nvs_len, GFP_KERNEL);
+   wl->nvs = kmemdup(fw->data, fw->size, GFP_KERNEL);
 
if (!wl->nvs) {
wl1251_error("could not allocate memory for the nvs file");
@@ -131,6 +130,8 @@ static int wl1251_fetch_nvs(struct wl1251 *wl)
goto out;
}
 
+   wl->nvs_len = fw->size;
+
ret = 0;
 
 out:
-- 
1.7.9.5

[PATCH v2 5/6] firmware: Add request_firmware_prefer_user() function

2017-11-09 Thread Pali Rohár

This function works pretty much like request_firmware(), but it prefers
the usermode helper. If the usermode helper fails, it falls back to direct
access. Useful for dynamic or model specific firmware data.

Signed-off-by: Pali Rohár 
---
 drivers/base/firmware_class.c |   45 +++--
 include/linux/firmware.h  |9 +
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 4b57cf5..c3a9fe5 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -195,6 +195,11 @@ static int __fw_state_check(struct fw_state *fw_st, enum 
fw_status status)
 #endif
 #define FW_OPT_NO_WARN (1U << 3)
 #define FW_OPT_NOCACHE (1U << 4)
+#ifdef CONFIG_FW_LOADER_USER_HELPER
+#define FW_OPT_PREFER_USER (1U << 5)
+#else
+#define FW_OPT_PREFER_USER 0
+#endif
 
 struct firmware_cache {
/* firmware_buf instance will be added into the below list */
@@ -1214,13 +1219,26 @@ static void fw_abort_batch_reqs(struct firmware *fw)
if (ret <= 0) /* error or already assigned */
goto out;
 
-   ret = fw_get_filesystem_firmware(device, fw->priv);
+   if (opt_flags & FW_OPT_PREFER_USER) {
+   ret = fw_load_from_user_helper(fw, name, device, opt_flags, 
timeout);
+   if (ret && !(opt_flags & FW_OPT_NO_WARN)) {
+   dev_warn(device,
+"User helper firmware load for %s failed with 
error %d\n",
+name, ret);
+   dev_warn(device, "Falling back to direct firmware 
load\n");
+   }
+   } else {
+   ret = -EINVAL;
+   }
+
+   if (ret)
+   ret = fw_get_filesystem_firmware(device, fw->priv);
if (ret) {
if (!(opt_flags & FW_OPT_NO_WARN))
dev_warn(device,
 "Direct firmware load for %s failed with error 
%d\n",
 name, ret);
-   if (opt_flags & FW_OPT_USERHELPER) {
+   if ((opt_flags & FW_OPT_USERHELPER) && !(opt_flags & 
FW_OPT_PREFER_USER)) {
dev_warn(device, "Falling back to user helper\n");
ret = fw_load_from_user_helper(fw, name, device,
   opt_flags);
@@ -1329,6 +1347,29 @@ int request_firmware_direct(const struct firmware 
**firmware_p,
 EXPORT_SYMBOL(request_firmware_into_buf);
 
 /**
+ * request_firmware_prefer_user: - prefer usermode helper for loading firmware
+ * @firmware_p: pointer to firmware image
+ * @name: name of firmware file
+ * @device: device for which firmware is being loaded
+ *
+ * This function works pretty much like request_firmware(), but it prefer
+ * usermode helper. If usermode helper fails then it fallback to direct access.
+ * Useful for dynamic or model specific firmware data.
+ **/
+int request_firmware_prefer_user(const struct firmware **firmware_p,
+   const char *name, struct device *device)
+{
+   int ret;
+
+   __module_get(THIS_MODULE);
+   ret = _request_firmware(firmware_p, name, device, NULL, 0,
+   FW_OPT_UEVENT | FW_OPT_PREFER_USER);
+   module_put(THIS_MODULE);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(request_firmware_prefer_user);
+
+/**
  * release_firmware: - release the resource associated with a firmware image
  * @fw: firmware resource to release
  **/
diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index d450808..8584528 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -48,6 +48,8 @@ int request_firmware_nowait(
void (*cont)(const struct firmware *fw, void *context));
 int request_firmware_direct(const struct firmware **fw, const char *name,
struct device *device);
+int request_firmware_prefer_user(const struct firmware **fw, const char *name,
+struct device *device);
 int request_firmware_into_buf(const struct firmware **firmware_p,
const char *name, struct device *device, void *buf, size_t size);
 
@@ -78,6 +80,13 @@ static inline int request_firmware_direct(const struct 
firmware **fw,
return -EINVAL;
 }
 
+static inline int request_firmware_prefer_user(const struct firmware **fw,
+  const char *name,
+  struct device *device)
+{
+   return -EINVAL;
+}
+
 static inline int request_firmware_into_buf(const struct firmware **firmware_p,
const char *name, struct device *device, void *buf, size_t size)
 {
-- 
1.7.9.5

[PATCH v2 4/6] wl1251: Set generated MAC address back to NVS data

2017-11-09 Thread Pali Rohár

If there is no valid MAC address, the kernel generates a random one. This
patch propagates the generated MAC address back to the NVS data which will
be uploaded to the wl1251 chip, so the HW will have the same MAC address as
the Linux kernel uses.

This should not change any functionality, but it is better to tell the
wl1251 the correct MAC address from the beginning of chip usage.

Signed-off-by: Pali Rohár 
---
 drivers/net/wireless/ti/wl1251/main.c |   17 +
 1 file changed, 17 insertions(+)

diff --git a/drivers/net/wireless/ti/wl1251/main.c 
b/drivers/net/wireless/ti/wl1251/main.c
index d497ba5..1f423be 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -1481,6 +1481,21 @@ static int wl1251_read_nvs_mac(struct wl1251 *wl)
return 0;
 }
 
+static int wl1251_write_nvs_mac(struct wl1251 *wl)
+{
+   int i, ret;
+
+   ret = wl1251_check_nvs_mac(wl);
+   if (ret)
+   return ret;
+
+   /* MAC is stored in reverse order */
+   for (i = 0; i < ETH_ALEN; i++)
+   wl->nvs[NVS_OFF_MAC_DATA + i] = wl->mac_addr[ETH_ALEN - i - 1];
+
+   return 0;
+}
+
 static int wl1251_register_hw(struct wl1251 *wl)
 {
int ret;
@@ -1546,6 +1561,8 @@ int wl1251_init_ieee80211(struct wl1251 *wl)
static const u8 nokia_oui[3] = {0x00, 0x1f, 0xdf};
memcpy(wl->mac_addr, nokia_oui, 3);
get_random_bytes(wl->mac_addr + 3, 3);
+   if (!wl->use_eeprom)
+   wl1251_write_nvs_mac(wl);
wl1251_warning("MAC address in eeprom or nvs data is not 
valid");
wl1251_warning("Setting random MAC address: %pM", wl->mac_addr);
}
-- 
1.7.9.5

[PATCH v2 3/6] wl1251: Parse and use MAC address from supplied NVS data

2017-11-09 Thread Pali Rohár

This patch implements parsing MAC address from NVS data which are sent to
wl1251 chip. Calibration NVS data could contain valid MAC address and it
will be used instead of randomly generated one.

This patch also moves code for requesting NVS data from userspace to driver
initialization code to make sure that NVS data will be there at time when
permanent MAC address is needed.

Calibration NVS data for wl1251 are device specific. Every device with
wl1251 chip should have been calibrated in factory and needs to provide own
calibration data.

Default example file wl1251-nvs.bin, found in linux-firmware repository,
contains MAC address 00:00:20:07:03:09. So this MAC address is marked as
invalid as it is not real device specific address, just example one.

Format of calibration NVS data can be found at:
http://notaz.gp2x.de/misc/pnd/wl1251/nvs_map.txt

Signed-off-by: Pali Rohár 
---
 drivers/net/wireless/ti/wl1251/main.c |   55 -
 1 file changed, 47 insertions(+), 8 deletions(-)

diff --git a/drivers/net/wireless/ti/wl1251/main.c 
b/drivers/net/wireless/ti/wl1251/main.c
index 9106c20..d497ba5 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -203,13 +203,6 @@ static int wl1251_chip_wakeup(struct wl1251 *wl)
goto out;
}
 
-   if (wl->nvs == NULL && !wl->use_eeprom) {
-   /* No NVS from netlink, try to get it from the filesystem */
-   ret = wl1251_fetch_nvs(wl);
-   if (ret < 0)
-   goto out;
-   }
-
 out:
return ret;
 }
@@ -1448,6 +1441,46 @@ static int wl1251_read_eeprom_mac(struct wl1251 *wl)
return 0;
 }
 
+#define NVS_OFF_MAC_LEN 0x19
+#define NVS_OFF_MAC_ADDR_LO 0x1a
+#define NVS_OFF_MAC_ADDR_HI 0x1b
+#define NVS_OFF_MAC_DATA 0x1c
+
+static int wl1251_check_nvs_mac(struct wl1251 *wl)
+{
+   if (wl->nvs_len < 0x24)
+   return -ENODATA;
+
+   /* length is 2 and data address is 0x546c (ANDed with 0xfffe) */
+   if (wl->nvs[NVS_OFF_MAC_LEN] != 2 ||
+   wl->nvs[NVS_OFF_MAC_ADDR_LO] != 0x6d ||
+   wl->nvs[NVS_OFF_MAC_ADDR_HI] != 0x54)
+   return -EINVAL;
+
+   return 0;
+}
+
+static int wl1251_read_nvs_mac(struct wl1251 *wl)
+{
+   u8 mac[ETH_ALEN];
+   int i, ret;
+
+   ret = wl1251_check_nvs_mac(wl);
+   if (ret)
+   return ret;
+
+   /* MAC is stored in reverse order */
+   for (i = 0; i < ETH_ALEN; i++)
+   mac[i] = wl->nvs[NVS_OFF_MAC_DATA + ETH_ALEN - i - 1];
+
+   /* 00:00:20:07:03:09 is in example file wl1251-nvs.bin, so invalid */
+   if (ether_addr_equal_unaligned(mac, "\x00\x00\x20\x07\x03\x09"))
+   return -EINVAL;
+
+   memcpy(wl->mac_addr, mac, ETH_ALEN);
+   return 0;
+}
+
 static int wl1251_register_hw(struct wl1251 *wl)
 {
int ret;
@@ -1491,10 +1524,16 @@ int wl1251_init_ieee80211(struct wl1251 *wl)
 
wl->hw->queues = 4;
 
+   if (wl->nvs == NULL && !wl->use_eeprom) {
+   ret = wl1251_fetch_nvs(wl);
+   if (ret < 0)
+   goto out;
+   }
+
if (wl->use_eeprom)
ret = wl1251_read_eeprom_mac(wl);
else
-   ret = -EINVAL;
+   ret = wl1251_read_nvs_mac(wl);
 
if (ret == 0 && !is_valid_ether_addr(wl->mac_addr))
ret = -EINVAL;
-- 
1.7.9.5

[PATCH v2 6/6] wl1251: Use request_firmware_prefer_user() for loading NVS calibration data

2017-11-09 Thread Pali Rohár

NVS calibration data for wl1251 are model specific. Every device with a
wl1251 chip has different data, calibrated in the factory.

Not all wl1251 chips have their own EEPROM where the calibration data are
stored, and in that case there is no "standard" place. Every device stores
them in a different place (some in a rootfs file, some in a dedicated nand
partition, some in another proprietary structure).

The kernel wl1251 driver cannot support every different storage chosen by
the device manufacturer, so it will use the request_firmware_prefer_user()
call for loading NVS calibration data, and a userspace helper will be
responsible for preparing the correct data.

In case the userspace helper fails, request_firmware_prefer_user() still
tries to load the data file directly from VFS as a fallback mechanism.

On Nokia N900 device, which has wl1251 chip, NVS calibration data are
stored in CAL nand partition. CAL is proprietary Nokia key/value format for
nand devices.

With this patch it is finally possible to load correct model specific NVS
calibration data for Nokia N900.

Userspace tool for reading NVS calibration data on Nokia N900 is available
in git repository at: https://github.com/community-ssu/wl1251-cal

Signed-off-by: Pali Rohár 
---
 drivers/net/wireless/ti/wl1251/Kconfig |1 +
 drivers/net/wireless/ti/wl1251/main.c  |2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ti/wl1251/Kconfig 
b/drivers/net/wireless/ti/wl1251/Kconfig
index 7142ccf..affe154 100644
--- a/drivers/net/wireless/ti/wl1251/Kconfig
+++ b/drivers/net/wireless/ti/wl1251/Kconfig
@@ -2,6 +2,7 @@ config WL1251
tristate "TI wl1251 driver support"
depends on MAC80211
select FW_LOADER
+   select FW_LOADER_USER_HELPER
select CRC7
---help---
  This will enable TI wl1251 driver support. The drivers make
diff --git a/drivers/net/wireless/ti/wl1251/main.c 
b/drivers/net/wireless/ti/wl1251/main.c
index 1f423be..e9d232c 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -108,7 +108,7 @@ static int wl1251_fetch_nvs(struct wl1251 *wl)
struct device *dev = wiphy_dev(wl->hw->wiphy);
int ret;
 
-   ret = request_firmware(, WL1251_NVS_NAME, dev);
+   ret = request_firmware_prefer_user(, WL1251_NVS_NAME, dev);
 
if (ret < 0) {
wl1251_error("could not get nvs file: %d", ret);
-- 
1.7.9.5

[PATCH net-next] net: dsa: mv88e6xxx: Fix stats histogram mode

2017-11-09 Thread Andrew Lunn

The statistics histogram mode was not being explicitly initialized on
devices other than the 6390 family. Clearing the statistics then
overwrote the default setting, setting the histogram to a reserved
mode.

Explicitly set the histogram mode for all devices. Change the
statistics clear into a read/modify/write, and since it is now more
complex, move it into global1.c.

Signed-off-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c| 34 +-
 drivers/net/dsa/mv88e6xxx/global1.c | 35 +++
 drivers/net/dsa/mv88e6xxx/global1.h |  2 ++
 3 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 6dd5fdfeafcf..d6c3a22c8789 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2004,19 +2004,7 @@ static int mv88e6xxx_g1_setup(struct mv88e6xxx_chip 
*chip)
if (err)
return err;
 
-   /* Clear the statistics counters for all ports */
-   err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_STATS_OP,
-MV88E6XXX_G1_STATS_OP_BUSY |
-MV88E6XXX_G1_STATS_OP_FLUSH_ALL);
-   if (err)
-   return err;
-
-   /* Wait for the flush to complete. */
-   err = mv88e6xxx_g1_stats_wait(chip);
-   if (err)
-   return err;
-
-   return 0;
+   return mv88e6xxx_g1_stats_clear(chip);
 }
 
 static int mv88e6xxx_setup(struct dsa_switch *ds)
@@ -2299,6 +2287,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
.port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit,
.port_disable_pri_override = mv88e6xxx_port_disable_pri_override,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
+   .stats_set_histogram = mv88e6095_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
@@ -2326,6 +2315,7 @@ static const struct mv88e6xxx_ops mv88e6095_ops = {
.port_set_egress_floods = mv88e6185_port_set_egress_floods,
.port_set_upstream_port = mv88e6095_port_set_upstream_port,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
+   .stats_set_histogram = mv88e6095_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
@@ -2356,6 +2346,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
.port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit,
.port_disable_pri_override = mv88e6xxx_port_disable_pri_override,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
+   .stats_set_histogram = mv88e6095_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
@@ -2383,6 +2374,7 @@ static const struct mv88e6xxx_ops mv88e6123_ops = {
.port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit,
.port_disable_pri_override = mv88e6xxx_port_disable_pri_override,
.stats_snapshot = mv88e6320_g1_stats_snapshot,
+   .stats_set_histogram = mv88e6095_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
@@ -2413,6 +2405,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
.port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
.port_pause_limit = mv88e6097_port_pause_limit,
.stats_snapshot = mv88e6xxx_g1_stats_snapshot,
+   .stats_set_histogram = mv88e6095_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6095_stats_get_sset_count,
.stats_get_strings = mv88e6095_stats_get_strings,
.stats_get_stats = mv88e6095_stats_get_stats,
@@ -2449,6 +2442,7 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
.port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit,
.port_disable_pri_override = mv88e6xxx_port_disable_pri_override,
.stats_snapshot = mv88e6390_g1_stats_snapshot,
+   .stats_set_histogram = mv88e6095_g1_stats_set_histogram,
.stats_get_sset_count = mv88e6320_stats_get_sset_count,
.stats_get_strings = mv88e6320_stats_get_strings,
.stats_get_stats = mv88e6390_stats_get_stats,
@@ -2481,6 +2475,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = {
.port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit,
.port_disable_pri_override = mv88e6xxx_port_disable_pri_override,
.stats_snapshot = mv88e6320_g1_stats_snapshot,
+   .stats_set_histogram = mv88e6095_g1_stats_set_histogram,
.stats_get_sset_count

linux-next: manual merge of the net-next tree with Linus' tree

2017-11-09 Thread Stephen Rothwell

Hi all,

Today's linux-next merge of the net-next tree got a conflict in:

  net/sched/cls_basic.c
  net/sched/cls_u32.c

between commits:

  0b2a59894b76 ("cls_basic: use tcf_exts_get_net() before call_rcu()")
  35c55fc156d8 ("cls_u32: use tcf_exts_get_net() before call_rcu()")

from Linus' tree and commit:

  1d8134fea2eb ("net_sched: use idr to allocate basic filter handles")

from the net-next tree.

I fixed it up (I think - see below) and can carry the fix as necessary.
This is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc net/sched/cls_basic.c
index e43c56d5b96a,871351358c10..
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@@ -118,11 -115,10 +120,13 @@@ static void basic_destroy(struct tcf_pr
list_for_each_entry_safe(f, n, >flist, link) {
list_del_rcu(>link);
tcf_unbind_filter(tp, >res);
+   idr_remove_ext(>handle_idr, f->handle);
 -  call_rcu(>rcu, basic_delete_filter);
 +  if (tcf_exts_get_net(>exts))
 +  call_rcu(>rcu, basic_delete_filter);
 +  else
 +  __basic_delete_filter(f);
}
+   idr_destroy(>handle_idr);
kfree_rcu(head, rcu);
  }
  
@@@ -133,7 -129,7 +137,8 @@@ static int basic_delete(struct tcf_prot
  
list_del_rcu(>link);
tcf_unbind_filter(tp, >res);
 +  tcf_exts_get_net(>exts);
+   idr_remove_ext(>handle_idr, f->handle);
call_rcu(>rcu, basic_delete_filter);
*last = list_empty(>flist);
return 0;
@@@ -226,9 -222,9 +231,10 @@@ static int basic_change(struct net *net
*arg = fnew;
  
if (fold) {
+   idr_replace_ext(>handle_idr, fnew, fnew->handle);
list_replace_rcu(>link, >link);
tcf_unbind_filter(tp, >res);
 +  tcf_exts_get_net(>exts);
call_rcu(>rcu, basic_delete_filter);
} else {
list_add_rcu(>link, >flist);
diff --cc net/sched/cls_u32.c
index b58eccb21f03,2737b71854c9..
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@@ -590,10 -589,8 +591,11 @@@ static void u32_clear_hnode(struct tcf_
 rtnl_dereference(n->next));
tcf_unbind_filter(tp, >res);
u32_remove_hw_knode(tp, n->handle);
+   idr_remove_ext(>handle_idr, n->handle);
 -  call_rcu(>rcu, u32_delete_key_freepf_rcu);
 +  if (tcf_exts_get_net(>exts))
 +  call_rcu(>rcu, u32_delete_key_freepf_rcu);
 +  else
 +  u32_destroy_key(n->tp, n, true);
}
}
  }

[PATCH net-next] bpf: add support for SO_PRIORITY in bpf_getsockopt

2017-11-09 Thread Vlad Dumitrescu

From: Vlad Dumitrescu 

Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.

Signed-off-by: Vlad Dumitrescu 
---
 net/core/filter.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 1afa17935954..61c791f9f628 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3292,8 +3292,20 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, 
bpf_sock,
if (!sk_fullsock(sk))
goto err_clear;
 
+   if (level == SOL_SOCKET) {
+   if (optlen != sizeof(int))
+   goto err_clear;
+
+   switch (optname) {
+   case SO_PRIORITY:
+   *((int *)optval) = sk->sk_priority;
+   break;
+   default:
+   goto err_clear;
+   }
 #ifdef CONFIG_INET
-   if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
+   } else if (level == SOL_TCP &&
+  sk->sk_prot->getsockopt == tcp_getsockopt) {
if (optname == TCP_CONGESTION) {
struct inet_connection_sock *icsk = inet_csk(sk);
 
@@ -3304,11 +3316,11 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, 
bpf_sock,
} else {
goto err_clear;
}
+#endif
} else {
goto err_clear;
}
return 0;
-#endif
 err_clear:
memset(optval, 0, optlen);
return -EINVAL;
-- 
2.15.0.448.gf294e3d99a-goog

Re: [PATCH net-next 1/3] net: bgmac: Pad packets to a minimum size

2017-11-09 Thread Florian Fainelli

On 11/09/2017 02:37 PM, Andrew Lunn wrote:
> On Thu, Nov 09, 2017 at 02:26:04PM -0800, Florian Fainelli wrote:
>> In preparation for enabling Broadcom tags with b53, pad packets to a
>> minimum size of 64 bytes (sans FCS) in order for the Broadcom switch to
>> accept ingressing frames. Without this, we would typically be able to
>> DHCP, but not resolve with ARP because packets are too small and get
>> rejected by the switch.
> 
> Hi Florian
> 
> Is the MAC sending runt packets in its default configuration? Is this
> a general issue, and not just an issue when there is a switch directly
> attached?

The MAC is sending 64 bytes (with FCS) padded packets by default, but
this apparently gets mis-calculated when Broadcom tags are enabled, such
that we need to pad before to avoid that.
-- 
Florian

[PATCH] uprobes/x86: emulate push insns for uprobe on x86

2017-11-09 Thread Yonghong Song

Uprobe is a tracing mechanism for userspace programs.
Typical uprobe will incur overhead of two traps.
First trap is caused by replaced trap insn, and
the second trap is to execute the original displaced
insn in user space.

To reduce the overhead, kernel provides hooks
for architectures to emulate the original insn
and skip the second trap. In x86, emulation
is done for certain branch insns.

This patch extends the emulation to "push <reg>"
insns. These insns are typical in the beginning
of the function. For example, bcc
in https://github.com/iovisor/bcc repo provides
tools to measure funclatency, detect memleak, etc.
The tools will place uprobes in the beginning of
function and possibly uretprobes at the end of function.
This patch is able to reduce the trap overhead for
uprobe from 2 to 1.

Without this patch, uretprobe will typically incur
three traps. With this patch, if the function starts
with "push" insn, the number of traps can be
reduced from 3 to 2.

An experiment was conducted on two local VMs,
fedora 26 64-bit VM and 32-bit VM, both 4 processors
and 4GB memory, booted with latest tip repo (and this patch).
The host is a MacBook with an Intel i7 processor.

The test program looks like
  #include <stdio.h>
  #include <stdlib.h>
  #include <time.h>
  #include <sys/time.h>

  static void test() __attribute__((noinline));
  void test() {}
  int main() {
struct timeval start, end;

gettimeofday(&start, NULL);
for (int i = 0; i < 1000000; i++) {
  test();
}
gettimeofday(&end, NULL);

printf("%ld\n", ((end.tv_sec * 1000000 + end.tv_usec)
 - (start.tv_sec * 1000000 + start.tv_usec)));
return 0;
  }

The program is compiled without optimization, and
the first insn for function "test" is "push %rbp".
The host is relatively idle.

Before the test run, the uprobe is inserted as below for uprobe:
  echo 'p <binary>:<function_offset>' > /sys/kernel/debug/tracing/uprobe_events
  echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable
and for uretprobe:
  echo 'r <binary>:<function_offset>' > /sys/kernel/debug/tracing/uprobe_events
  echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable

Unit: microsecond(usec) per loop iteration

x86_64      W/ this patch   W/O this patch
uprobe      1.55            3.1
uretprobe   2.0             3.6

x86_32      W/ this patch   W/O this patch
uprobe      1.41            3.5
uretprobe   1.75            4.0

You can see that this patch significantly reduced the overhead,
50% for uprobe and 44% for uretprobe on x86_64, and even more
on x86_32.

Signed-off-by: Yonghong Song 
---
 arch/x86/include/asm/uprobes.h |   4 ++
 arch/x86/kernel/uprobes.c  | 110 ++---
 2 files changed, 107 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 74f4c2f..a90090c 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -53,6 +53,10 @@ struct arch_uprobe {
u8  fixups;
u8  ilen;
}   defparam;
+   struct {
+   u8  src_offset; /* to the start of pt_regs */
+   u8  ilen;
+   }   push;
};
 };
 
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index a3755d2..1ee8b59 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -528,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe 
*auprobe, struct pt_regs *regs)
return 0;
 }
 
-static int push_ret_address(struct pt_regs *regs, unsigned long ip)
+static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
 {
unsigned long new_sp = regs->sp - sizeof_long();
 
-   if (copy_to_user((void __user *)new_sp, , sizeof_long()))
+   if (copy_to_user((void __user *)new_sp, , sizeof_long()))
return -EFAULT;
 
regs->sp = new_sp;
@@ -566,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, 
struct pt_regs *regs
regs->ip += correction;
} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
regs->sp += sizeof_long(); /* Pop incorrect return address */
-   if (push_ret_address(regs, utask->vaddr + 
auprobe->defparam.ilen))
+   if (emulate_push_stack(regs, utask->vaddr + 
auprobe->defparam.ilen))
return -ERESTART;
}
/* popf; tell the caller to not touch TF */
@@ -655,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, 
struct pt_regs *regs)
 *
 * But there is corner case, see the comment in ->post_xol().
 */
-   if (push_ret_address(regs, new_ip))
+   if (emulate_push_stack(regs, new_ip))
return false;
} else if (!check_jmp_cond(auprobe, regs)) {
offs = 0;
@@ -665,6 +665,16 @@ static bool

Re: [PATCH net-next 1/3] net: bgmac: Pad packets to a minimum size

2017-11-09 Thread Andrew Lunn

On Thu, Nov 09, 2017 at 02:26:04PM -0800, Florian Fainelli wrote:
> In preparation for enabling Broadcom tags with b53, pad packets to a
> minimum size of 64 bytes (sans FCS) in order for the Broadcom switch to
> accept ingressing frames. Without this, we would typically be able to
> DHCP, but not resolve with ARP because packets are too small and get
> rejected by the switch.

Hi Florian

Is the MAC sending runt packets in its default configuration? Is this
a general issue, and not just an issue when there is a switch directly
attached?

Thanks
Andrew

[PATCH net-next v2 2/3] net: dsa: b53: Stop using dev->cpu_port incorrectly

2017-11-09 Thread Florian Fainelli

dev->cpu_port is the driver local information that should only be used
to look up register offsets for a particular port, when they differ
(e.g: IMP port override), but it should certainly not be used in place
of the DSA configured CPU port.

Since the DSA switch layer calls port_vlan_{add,del}() on the CPU port
as well, we can remove the specific setting of the CPU port within
port_vlan_{add,del}.

Fixes: ff39c2d68679 ("net: dsa: b53: Add bridge support")
Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch")
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index a7ca62ba27b7..17f12484ce24 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -505,7 +505,7 @@ EXPORT_SYMBOL(b53_imp_vlan_setup);
 int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
 {
struct b53_device *dev = ds->priv;
-   unsigned int cpu_port = dev->cpu_port;
+   unsigned int cpu_port = ds->ports[port].cpu_dp->index;
u16 pvlan;
 
/* Clear the Rx and Tx disable bits and set to no spanning tree */
@@ -1054,7 +1054,6 @@ void b53_vlan_add(struct dsa_switch *ds, int port,
struct b53_device *dev = ds->priv;
bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID;
-   unsigned int cpu_port = dev->cpu_port;
struct b53_vlan *vl;
u16 vid;
 
@@ -1063,12 +1062,11 @@ void b53_vlan_add(struct dsa_switch *ds, int port,
 
b53_get_vlan_entry(dev, vid, vl);
 
-   vl->members |= BIT(port) | BIT(cpu_port);
+   vl->members |= BIT(port);
if (untagged)
vl->untag |= BIT(port);
else
vl->untag &= ~BIT(port);
-   vl->untag &= ~BIT(cpu_port);
 
b53_set_vlan_entry(dev, vid, vl);
b53_fast_age_vlan(dev, vid);
@@ -1432,8 +1430,8 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct 
net_device *br)
b53_write16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, reg);
} else {
b53_get_vlan_entry(dev, pvid, vl);
-   vl->members |= BIT(port) | BIT(dev->cpu_port);
-   vl->untag |= BIT(port) | BIT(dev->cpu_port);
+   vl->members |= BIT(port) | BIT(cpu_port);
+   vl->untag |= BIT(port) | BIT(cpu_port);
b53_set_vlan_entry(dev, pvid, vl);
}
 }
-- 
2.9.3

[PATCH net-next v2 1/3] net: bgmac: Pad packets to a minimum size

2017-11-09 Thread Florian Fainelli

In preparation for enabling Broadcom tags with b53, pad packets to a
minimum size of 64 bytes (sans FCS) in order for the Broadcom switch to
accept ingressing frames. Without this, we would typically be able to
DHCP, but not resolve with ARP because packets are too small and get
rejected by the switch.

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bgmac.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bgmac.c 
b/drivers/net/ethernet/broadcom/bgmac.c
index 48d672b204a4..5130fc96940d 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -127,6 +127,8 @@ bgmac_dma_tx_add_buf(struct bgmac *bgmac, struct 
bgmac_dma_ring *ring,
dma_desc->ctl1 = cpu_to_le32(ctl1);
 }
 
+#define ENET_BRCM_TAG_LEN  4
+
 static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac,
struct bgmac_dma_ring *ring,
struct sk_buff *skb)
@@ -139,6 +141,16 @@ static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac,
u32 flags;
int i;
 
+   /* The Ethernet switch we are interfaced with needs packets to be at
+* least 64 bytes (including FCS) otherwise they will be discarded when
+* they enter the switch port logic. When Broadcom tags are enabled, we
+* need to make sure that packets are at least 68 bytes
+* (including FCS and tag) because the length verification is done after
+* the Broadcom tag is stripped off the ingress packet.
+*/
+   if (skb_put_padto(skb, ETH_ZLEN + ENET_BRCM_TAG_LEN))
+   goto err_stats;
+
if (skb->len > BGMAC_DESC_CTL1_LEN) {
netdev_err(bgmac->net_dev, "Too long skb (%d)\n", skb->len);
goto err_drop;
@@ -225,6 +237,7 @@ static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac,
 
 err_drop:
dev_kfree_skb(skb);
+err_stats:
net_dev->stats.tx_dropped++;
net_dev->stats.tx_errors++;
return NETDEV_TX_OK;
-- 
2.9.3

[PATCH net-next v2 3/3] net: dsa: b53: Turn on Broadcom tags

2017-11-09 Thread Florian Fainelli

Enable Broadcom tags for b53 devices, except 5325 and 5365 which use a
different Broadcom tag format not yet supported by net/dsa/tag_brcm.c.

We also make sure that we can turn on Broadcom tags on a CPU port number
that is capable of that: 5, 7 or 8.

Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c | 48 
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 17f12484ce24..44a9a03bff55 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -325,7 +325,6 @@ static void b53_get_vlan_entry(struct b53_device *dev, u16 
vid,
 
 static void b53_set_forwarding(struct b53_device *dev, int enable)
 {
-   struct dsa_switch *ds = dev->ds;
u8 mgmt;
 
b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, );
@@ -337,14 +336,11 @@ static void b53_set_forwarding(struct b53_device *dev, 
int enable)
 
b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, mgmt);
 
-   /* Include IMP port in dumb forwarding mode when no tagging protocol is
-* set
+   /* Include IMP port in dumb forwarding mode
 */
-   if (ds->ops->get_tag_protocol(ds) == DSA_TAG_PROTO_NONE) {
-   b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, );
-   mgmt |= B53_MII_DUMB_FWDG_EN;
-   b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, mgmt);
-   }
+   b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, );
+   mgmt |= B53_MII_DUMB_FWDG_EN;
+   b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, mgmt);
 }
 
 static void b53_enable_vlan(struct b53_device *dev, bool enable)
@@ -612,6 +608,8 @@ static void b53_enable_cpu_port(struct b53_device *dev, int 
port)
PORT_CTRL_RX_MCST_EN |
PORT_CTRL_RX_UCST_EN;
b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), port_ctrl);
+
+   b53_brcm_hdr_setup(dev->ds, port);
 }
 
 static void b53_enable_mib(struct b53_device *dev)
@@ -1480,9 +1478,41 @@ void b53_br_fast_age(struct dsa_switch *ds, int port)
 }
 EXPORT_SYMBOL(b53_br_fast_age);
 
+static bool b53_can_enable_brcm_tags(struct dsa_switch *ds)
+{
+   unsigned int brcm_tag_mask;
+   unsigned int i;
+
+   /* Broadcom switches will accept enabling Broadcom tags on the
+* following ports: 5, 7 and 8, any other port is not supported
+*/
+   brcm_tag_mask = BIT(B53_CPU_PORT_25) | BIT(7) | BIT(B53_CPU_PORT);
+
+   for (i = 0; i < ds->num_ports; i++) {
+   if (dsa_is_cpu_port(ds, i)) {
+   if (!(BIT(i) & brcm_tag_mask)) {
+   dev_warn(ds->dev,
+"Port %d is not Broadcom tag 
capable\n",
+i);
+   return false;
+   }
+   }
+   }
+
+   return true;
+}
+
 static enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds)
 {
-   return DSA_TAG_PROTO_NONE;
+   struct b53_device *dev = ds->priv;
+
+   /* Older models support a different tag format that we do not
+* support in net/dsa/tag_brcm.c yet.
+*/
+   if (is5325(dev) || is5365(dev) || !b53_can_enable_brcm_tags(ds))
+   return DSA_TAG_PROTO_NONE;
+   else
+   return DSA_TAG_PROTO_BRCM;
 }
 
 int b53_mirror_add(struct dsa_switch *ds, int port,
-- 
2.9.3

[PATCH net-next v2 0/3] net: dsa: b53: Turn on Broadcom tags

2017-11-09 Thread Florian Fainelli

Hi all,

This was long overdue, with this patch series, the b53 driver now
turns on Broadcom tags except for 5325 and 5365 which use an older
format that we do not support yet (TBD).

First patch is necessary in order for bgmac, used on BCM5301X and Northstar
Plus to work correctly and successfully send ARP packets back to the requester.

Second patch is actually a bug fix, but because net/master and net-next/master
diverge in that area, I am targeting net-next/master here.

Finally, the last patch enables Broadcom tags after checking that the CPU port
selected is either 5, 7 or 8, since those are the only valid combinations
given currently supported HW.

Florian Fainelli (3):
  net: bgmac: Pad packets to a minimum size
  net: dsa: b53: Stop using dev->cpu_port incorrectly
  net: dsa: b53: Turn on Broadcom tags

 drivers/net/dsa/b53/b53_common.c  | 58 ++-
 drivers/net/ethernet/broadcom/bgmac.c | 13 
 2 files changed, 56 insertions(+), 15 deletions(-)

-- 
2.9.3

Re: [Patch net 05/13] cls_cgroup: use tcf_exts_get_net() before call_rcu()

2017-11-09 Thread Roman Mashak

Cong Wang  writes:


[...]

>  static void cls_cgroup_destroy_work(struct work_struct *work)
>  {
>   struct cls_cgroup_head *head = container_of(work,
>   struct cls_cgroup_head,
>   work);
>   rtnl_lock();
> - tcf_exts_destroy(>exts);
> - tcf_em_tree_destroy(>ematches);
> - kfree(head);
> + __cls_cgroup_destroy(head);
>   rtnl_unlock();
>  }
>  
> @@ -124,8 +130,10 @@ static int cls_cgroup_change(struct net *net, struct 
> sk_buff *in_skb,
>   goto errout;
>  
>   rcu_assign_pointer(tp->root, new);
> - if (head)
> + if (head) {
> + tcf_exts_get_net(>exts);
>   call_rcu(>rcu, cls_cgroup_destroy_rcu);

In this case why do you not need to care about success/failure of
tcf_exts_get_net() ?

> + }
>   return 0;
>  errout:
>   tcf_exts_destroy(>exts);
> @@ -138,8 +146,12 @@ static void cls_cgroup_destroy(struct tcf_proto *tp)
>   struct cls_cgroup_head *head = rtnl_dereference(tp->root);
>  
>   /* Head can still be NULL due to cls_cgroup_init(). */
> - if (head)
> - call_rcu(>rcu, cls_cgroup_destroy_rcu);
> + if (head) {
> + if (tcf_exts_get_net(>exts))
> + call_rcu(>rcu, cls_cgroup_destroy_rcu);
> + else
> + __cls_cgroup_destroy(head);
> + }
>  }
>  
>  static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last)

Re: [PATCH v3 7/8] netdev: octeon-ethernet: Add Cavium Octeon III support.

2017-11-09 Thread Andrew Lunn

> + if (link_changed != 0) {
> + struct port_status status;
> +
> + if (link_changed > 0) {
> + netdev_info(netdev, "Link is up - %d/%s\n",
> + priv->phydev->speed,
> + priv->phydev->duplex == DUPLEX_FULL ?
> + "Full" : "Half");
> + } else {
> + netdev_info(netdev, "Link is down\n");
> + }

phy_print_status()

Andrew

Re: [PATCH net-next 0/3] net: dsa: b53: Turn on Broadcom tags

2017-11-09 Thread Florian Fainelli

On 11/09/2017 02:26 PM, Florian Fainelli wrote:
> Hi all,
> 
> This was long overdue, with this patch series, the b53 driver now
> turns on Broadcom tags except for 5325 and 5365 which use an older
> format that we do not support yet (TBD).
> 
> First patch is necessary in order for bgmac, used on BCM5301X and Northstar
> Plus to work correctly and successfully send ARP packets back to the 
> requester.
> 
> Second patch is actually a bug fix, but because net/master and net-next/master
> diverge in that area, I am targeting net-next/master here.
> 
> Finally, the last patch enables Broadcom tags after checking that the CPU port
> selected is either, 5, 7 or 8, since those are the only valid combinations
> given currently supported HW.

David, I need to respin patches 2 and 3 since I squashed an incorrect hunk
into patch 2 which should have been in patch 3, thus breaking bisectability.
-- 
Florian

Re: [PATCH net-next 2/3] net: dsa: b53: Stop using dev->cpu_port incorrectly

2017-11-09 Thread Florian Fainelli

On 11/09/2017 02:26 PM, Florian Fainelli wrote:
> dev->cpu_port is the driver local information that should only be used
> to look up register offsets for a particular port, when they differ
> (e.g: IMP port override), but it should certainly not be used in place
> of the DSA configured CPU port.
> 
> Since the DSA switch layer calls port_vlan_{add,del}() on the CPU port
> as well, we can remove the specific setting of the CPU port within
> port_vlan_{add,del}.
> 
> Fixes: ff39c2d68679 ("net: dsa: b53: Add bridge support")
> Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch")
> Signed-off-by: Florian Fainelli 
> ---
>  drivers/net/dsa/b53/b53_common.c | 11 ---
>  1 file changed, 4 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/dsa/b53/b53_common.c 
> b/drivers/net/dsa/b53/b53_common.c
> index a7ca62ba27b7..5e48e98b6187 100644
> --- a/drivers/net/dsa/b53/b53_common.c
> +++ b/drivers/net/dsa/b53/b53_common.c
> @@ -325,7 +325,6 @@ static void b53_get_vlan_entry(struct b53_device *dev, 
> u16 vid,
>  
>  static void b53_set_forwarding(struct b53_device *dev, int enable)
>  {
> - struct dsa_switch *ds = dev->ds;
>   u8 mgmt;

This hunk would break bisectability, let me respin something quickly
with that corrected.
-- 
Florian

[PATCH net-next 2/3] net: dsa: b53: Stop using dev->cpu_port incorrectly

2017-11-09 Thread Florian Fainelli

dev->cpu_port is the driver local information that should only be used
to look up register offsets for a particular port, when they differ
(e.g: IMP port override), but it should certainly not be used in place
of the DSA configured CPU port.

Since the DSA switch layer calls port_vlan_{add,del}() on the CPU port
as well, we can remove the specific setting of the CPU port within
port_vlan_{add,del}.

Fixes: ff39c2d68679 ("net: dsa: b53: Add bridge support")
Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch")
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index a7ca62ba27b7..5e48e98b6187 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -325,7 +325,6 @@ static void b53_get_vlan_entry(struct b53_device *dev, u16 
vid,
 
 static void b53_set_forwarding(struct b53_device *dev, int enable)
 {
-   struct dsa_switch *ds = dev->ds;
u8 mgmt;
 
b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, );
@@ -505,7 +504,7 @@ EXPORT_SYMBOL(b53_imp_vlan_setup);
 int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
 {
struct b53_device *dev = ds->priv;
-   unsigned int cpu_port = dev->cpu_port;
+   unsigned int cpu_port = ds->ports[port].cpu_dp->index;
u16 pvlan;
 
/* Clear the Rx and Tx disable bits and set to no spanning tree */
@@ -1054,7 +1053,6 @@ void b53_vlan_add(struct dsa_switch *ds, int port,
struct b53_device *dev = ds->priv;
bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID;
-   unsigned int cpu_port = dev->cpu_port;
struct b53_vlan *vl;
u16 vid;
 
@@ -1063,12 +1061,11 @@ void b53_vlan_add(struct dsa_switch *ds, int port,
 
b53_get_vlan_entry(dev, vid, vl);
 
-   vl->members |= BIT(port) | BIT(cpu_port);
+   vl->members |= BIT(port);
if (untagged)
vl->untag |= BIT(port);
else
vl->untag &= ~BIT(port);
-   vl->untag &= ~BIT(cpu_port);
 
b53_set_vlan_entry(dev, vid, vl);
b53_fast_age_vlan(dev, vid);
@@ -1432,8 +1429,8 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct 
net_device *br)
b53_write16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, reg);
} else {
b53_get_vlan_entry(dev, pvid, vl);
-   vl->members |= BIT(port) | BIT(dev->cpu_port);
-   vl->untag |= BIT(port) | BIT(dev->cpu_port);
+   vl->members |= BIT(port) | BIT(cpu_port);
+   vl->untag |= BIT(port) | BIT(cpu_port);
b53_set_vlan_entry(dev, pvid, vl);
}
 }
-- 
2.9.3

[PATCH net-next 0/3] net: dsa: b53: Turn on Broadcom tags

2017-11-09 Thread Florian Fainelli

Hi all,

This was long overdue, with this patch series, the b53 driver now
turns on Broadcom tags except for 5325 and 5365 which use an older
format that we do not support yet (TBD).

First patch is necessary in order for bgmac, used on BCM5301X and Northstar
Plus to work correctly and successfully send ARP packets back to the requester.

Second patch is actually a bug fix, but because net/master and net-next/master
diverge in that area, I am targeting net-next/master here.

Finally, the last patch enables Broadcom tags after checking that the CPU port
selected is either 5, 7 or 8, since those are the only valid combinations
given currently supported HW.

Florian Fainelli (3):
  net: bgmac: Pad packets to a minimum size
  net: dsa: b53: Stop using dev->cpu_port incorrectly
  net: dsa: b53: Turn on Broadcom tags

 drivers/net/dsa/b53/b53_common.c  | 58 ++-
 drivers/net/ethernet/broadcom/bgmac.c | 13 
 2 files changed, 56 insertions(+), 15 deletions(-)

-- 
2.9.3

[PATCH net-next 3/3] net: dsa: b53: Turn on Broadcom tags

2017-11-09 Thread Florian Fainelli

Enable Broadcom tags for b53 devices, except 5325 and 5365 which use a
different Broadcom tag format not yet supported by net/dsa/tag_brcm.c.

We also make sure that we can turn on Broadcom tags on a CPU port number
that is capable of that: 5, 7 or 8.

Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c | 47 +---
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 5e48e98b6187..44a9a03bff55 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -336,14 +336,11 @@ static void b53_set_forwarding(struct b53_device *dev, 
int enable)
 
b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, mgmt);
 
-   /* Include IMP port in dumb forwarding mode when no tagging protocol is
-* set
+   /* Include IMP port in dumb forwarding mode
 */
-   if (ds->ops->get_tag_protocol(ds) == DSA_TAG_PROTO_NONE) {
-   b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, );
-   mgmt |= B53_MII_DUMB_FWDG_EN;
-   b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, mgmt);
-   }
+   b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, );
+   mgmt |= B53_MII_DUMB_FWDG_EN;
+   b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, mgmt);
 }
 
 static void b53_enable_vlan(struct b53_device *dev, bool enable)
@@ -611,6 +608,8 @@ static void b53_enable_cpu_port(struct b53_device *dev, int 
port)
PORT_CTRL_RX_MCST_EN |
PORT_CTRL_RX_UCST_EN;
b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), port_ctrl);
+
+   b53_brcm_hdr_setup(dev->ds, port);
 }
 
 static void b53_enable_mib(struct b53_device *dev)
@@ -1479,9 +1478,41 @@ void b53_br_fast_age(struct dsa_switch *ds, int port)
 }
 EXPORT_SYMBOL(b53_br_fast_age);
 
+static bool b53_can_enable_brcm_tags(struct dsa_switch *ds)
+{
+   unsigned int brcm_tag_mask;
+   unsigned int i;
+
+   /* Broadcom switches will accept enabling Broadcom tags on the
+* following ports: 5, 7 and 8, any other port is not supported
+*/
+   brcm_tag_mask = BIT(B53_CPU_PORT_25) | BIT(7) | BIT(B53_CPU_PORT);
+
+   for (i = 0; i < ds->num_ports; i++) {
+   if (dsa_is_cpu_port(ds, i)) {
+   if (!(BIT(i) & brcm_tag_mask)) {
+   dev_warn(ds->dev,
+"Port %d is not Broadcom tag 
capable\n",
+i);
+   return false;
+   }
+   }
+   }
+
+   return true;
+}
+
 static enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds)
 {
-   return DSA_TAG_PROTO_NONE;
+   struct b53_device *dev = ds->priv;
+
+   /* Older models support a different tag format that we do not
+* support in net/dsa/tag_brcm.c yet.
+*/
+   if (is5325(dev) || is5365(dev) || !b53_can_enable_brcm_tags(ds))
+   return DSA_TAG_PROTO_NONE;
+   else
+   return DSA_TAG_PROTO_BRCM;
 }
 
 int b53_mirror_add(struct dsa_switch *ds, int port,
-- 
2.9.3

[PATCH net-next 1/3] net: bgmac: Pad packets to a minimum size

2017-11-09 Thread Florian Fainelli

In preparation for enabling Broadcom tags with b53, pad packets to a
minimum size of 64 bytes (sans FCS) in order for the Broadcom switch to
accept ingressing frames. Without this, we would typically be able to
DHCP, but not resolve with ARP because packets are too small and get
rejected by the switch.

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bgmac.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bgmac.c 
b/drivers/net/ethernet/broadcom/bgmac.c
index 48d672b204a4..5130fc96940d 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -127,6 +127,8 @@ bgmac_dma_tx_add_buf(struct bgmac *bgmac, struct 
bgmac_dma_ring *ring,
dma_desc->ctl1 = cpu_to_le32(ctl1);
 }
 
+#define ENET_BRCM_TAG_LEN  4
+
 static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac,
struct bgmac_dma_ring *ring,
struct sk_buff *skb)
@@ -139,6 +141,16 @@ static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac,
u32 flags;
int i;
 
+   /* The Ethernet switch we are interfaced with needs packets to be at
+* least 64 bytes (including FCS) otherwise they will be discarded when
+* they enter the switch port logic. When Broadcom tags are enabled, we
+* need to make sure that packets are at least 68 bytes
+* (including FCS and tag) because the length verification is done after
+* the Broadcom tag is stripped off the ingress packet.
+*/
+   if (skb_put_padto(skb, ETH_ZLEN + ENET_BRCM_TAG_LEN))
+   goto err_stats;
+
if (skb->len > BGMAC_DESC_CTL1_LEN) {
netdev_err(bgmac->net_dev, "Too long skb (%d)\n", skb->len);
goto err_drop;
@@ -225,6 +237,7 @@ static netdev_tx_t bgmac_dma_tx_add(struct bgmac *bgmac,
 
 err_drop:
dev_kfree_skb(skb);
+err_stats:
net_dev->stats.tx_dropped++;
net_dev->stats.tx_errors++;
return NETDEV_TX_OK;
-- 
2.9.3

Re: [PATCH v3 7/8] netdev: octeon-ethernet: Add Cavium Octeon III support.

2017-11-09 Thread Andrew Lunn

> + priv->phy_np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0);
> + priv->phy_mode = of_get_phy_mode(pdev->dev.of_node);
> + /* If phy-mode absent, default to SGMII. */
> + if (priv->phy_mode < 0)
> + priv->phy_mode = PHY_INTERFACE_MODE_SGMII;
> +
> + if (priv->phy_mode == PHY_INTERFACE_MODE_1000BASEX)
> + priv->mode_1000basex = true;
> +
> + if (of_phy_is_fixed_link(pdev->dev.of_node))
> + priv->bgx_as_phy = true;
> +

...

> + priv->mode = bgx_port_get_mode(priv->node, priv->bgx, priv->index);
> +

It might be a good idea to verify priv->phy_mode and priv->mode are
compatible.

> + switch (priv->mode) {
> + case PORT_MODE_SGMII:
> + case PORT_MODE_RGMII:
> + priv->get_link = bgx_port_get_sgmii_link;
> + priv->set_link = bgx_port_set_xgmii_link;
> + break;
> + case PORT_MODE_XAUI:
> + case PORT_MODE_RXAUI:
> + case PORT_MODE_XLAUI:
> + case PORT_MODE_XFI:
> + case PORT_MODE_10G_KR:
> + case PORT_MODE_40G_KR4:
> + priv->get_link = bgx_port_get_xaui_link;
> + priv->set_link = bgx_port_set_xaui_link;
> + break;


  Andrew

Re: [PATCH net-next v2] Revert "net: dsa: constify cpu_dp member of dsa_port"

2017-11-09 Thread Vivien Didelot

Hi David,

Vivien Didelot  writes:

> We may have to pass a port's dedicated CPU port to port-wide functions
> such as dsa_port_mdb_add, e.g. when offloading bridge device's MDB.
>
> The DSA port-wide functions currently take a non-const struct dsa_port *
> as first argument so we cannot make cpu_dp const yet, even though it'd
> be more correct. Revert this patch and make cpu_dp non-const again.
>
> This reverts commit 24a9332a58b7f41a0d36c35a2c6897242bffdbc0.

No need to apply this revert, Andrew made the dsa_port_mdb_* functions
take a const dsa_port in his new IGMP patchset so that we can keep the
cpu_dp const, as it should be.

Thanks,

Vivien

Re: [PATCH net-next] Revert "net: dsa: constify cpu_dp member of dsa_port"

2017-11-09 Thread Vivien Didelot

Hi Andrew,

Andrew Lunn  writes:

> And i will add the needed const attributes to these functions as part
> of the IGMP patchset.

Sounds good, no need for this revert then.


Thanks,

Vivien

Re: [PATCH v4 net-next 5/6] net: dsa: add more const attributes

2017-11-09 Thread Vivien Didelot

Hi Andrew,

Andrew Lunn  writes:

> The notify mechanism does not need to modify the port it is notifying.
> So make the parameter const.
>
> Signed-off-by: Andrew Lunn 

Reviewed-by: Vivien Didelot

[PATCH v4 net-next 0/6] IGMP snooping for local traffic

2017-11-09 Thread Andrew Lunn

The linux bridge supports IGMP snooping. It will listen to IGMP
reports on bridge ports and keep track of which groups have been
joined on an interface. It will then forward multicast based on this
group membership.

When the bridge adds or removes groups from an interface, it uses
switchdev to request the hardware add an mdb to a port, so the
hardware can perform the selective forwarding between ports.

What is not covered by the current bridge code is IGMP joins/leaves
from the host on the brX interface. These are not reported via
switchdev, so the hardware does not know the local host is interested
in the multicast frames.

Luckily, the bridge does track joins/leaves on the brX interface. The
code is obfuscated, which is why I missed it with my first attempt.
So the first patch tries to remove this obfuscation. Currently,
there are no notifications sent when the bridge interface joins a
group. The second patch adds them. bridge monitor then shows
joins/leaves in the same way as for other ports of the bridge.

Then starts the work passing down to the hardware that the host has
joined/left a group. The existing switchdev mdb object cannot be used,
since the semantics are different. The existing
SWITCHDEV_OBJ_ID_PORT_MDB is used to indicate a specific multicast
group should be forwarded out that port of the switch. However here we
require the exact opposite. We want multicast frames for the group
received on the port to be forwarded to the host. Hence add a new
object SWITCHDEV_OBJ_ID_HOST_MDB, a multicast database entry to
forward to the host. This new object is then propagated through the
DSA layers. No DSA driver changes should be needed, this should just
work...

This version fixes up the nitpick from Nikolay, removes an unrelated
white space change, and adds in a patch adding a few const attributes
to a couple of functions taking a port parameter, in order to stop the
following patch from producing warnings.

Acked-by: Stephen Hemminger 

Andrew Lunn (6):
  net: bridge: Rename mglist to host_joined
  net: bridge: Send notification when host join/leaves a group
  net: bridge: Add/del switchdev object on host join/leave
  net: dsa: slave: Handle switchdev host mdb add/del
  net: dsa: add more const attributes
  net: dsa: switch: Don't add CPU port to an mdb by default

 include/net/switchdev.h   |  1 +
 net/bridge/br_input.c |  2 +-
 net/bridge/br_mdb.c   | 54 +++
 net/bridge/br_multicast.c | 18 ++--
 net/bridge/br_private.h   |  2 +-
 net/dsa/dsa_priv.h|  4 ++--
 net/dsa/port.c|  6 +++---
 net/dsa/slave.c   | 13 
 net/dsa/switch.c  |  2 +-
 net/switchdev/switchdev.c |  2 ++
 10 files changed, 85 insertions(+), 19 deletions(-)

-- 
2.15.0

[PATCH v4 net-next 1/6] net: bridge: Rename mglist to host_joined

2017-11-09 Thread Andrew Lunn

The boolean mglist indicates the host has joined a particular
multicast group on the bridge interface. It is badly named, obscuring
what it means. Rename it.

Signed-off-by: Andrew Lunn 
Acked-by: Nikolay Aleksandrov 
Acked-by: Florian Fainelli 
---
 net/bridge/br_input.c |  2 +-
 net/bridge/br_mdb.c   |  2 +-
 net/bridge/br_multicast.c | 14 +++---
 net/bridge/br_private.h   |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index a096d3e189da..7f98a7d25866 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -137,7 +137,7 @@ int br_handle_frame_finish(struct net *net, struct sock 
*sk, struct sk_buff *skb
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb))) {
-   if ((mdst && mdst->mglist) ||
+   if ((mdst && mdst->host_joined) ||
br_multicast_is_router(br)) {
local_rcv = true;
br->dev->stats.multicast++;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 31ddff22563e..aa716a33cb71 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -655,7 +655,7 @@ static int __br_mdb_del(struct net_bridge *br, struct 
br_mdb_entry *entry)
call_rcu_bh(>rcu, br_multicast_free_pg);
err = 0;
 
-   if (!mp->ports && !mp->mglist &&
+   if (!mp->ports && !mp->host_joined &&
netif_running(br->dev))
mod_timer(>timer, jiffies);
break;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 5f7f0e9d446c..bfe5adb1f51c 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -249,7 +249,7 @@ static void br_multicast_group_expired(struct timer_list *t)
if (!netif_running(br->dev) || timer_pending(>timer))
goto out;
 
-   mp->mglist = false;
+   mp->host_joined = false;
 
if (mp->ports)
goto out;
@@ -292,7 +292,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
  p->flags);
call_rcu_bh(>rcu, br_multicast_free_pg);
 
-   if (!mp->ports && !mp->mglist &&
+   if (!mp->ports && !mp->host_joined &&
netif_running(br->dev))
mod_timer(>timer, jiffies);
 
@@ -773,7 +773,7 @@ static int br_multicast_add_group(struct net_bridge *br,
goto err;
 
if (!port) {
-   mp->mglist = true;
+   mp->host_joined = true;
mod_timer(>timer, now + br->multicast_membership_interval);
goto out;
}
@@ -1477,7 +1477,7 @@ static int br_ip4_multicast_query(struct net_bridge *br,
 
max_delay *= br->multicast_last_member_count;
 
-   if (mp->mglist &&
+   if (mp->host_joined &&
(timer_pending(>timer) ?
 time_after(mp->timer.expires, now + max_delay) :
 try_to_del_timer_sync(>timer) >= 0))
@@ -1561,7 +1561,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
goto out;
 
max_delay *= br->multicast_last_member_count;
-   if (mp->mglist &&
+   if (mp->host_joined &&
(timer_pending(>timer) ?
 time_after(mp->timer.expires, now + max_delay) :
 try_to_del_timer_sync(>timer) >= 0))
@@ -1622,7 +1622,7 @@ br_multicast_leave_group(struct net_bridge *br,
br_mdb_notify(br->dev, port, group, RTM_DELMDB,
  p->flags);
 
-   if (!mp->ports && !mp->mglist &&
+   if (!mp->ports && !mp->host_joined &&
netif_running(br->dev))
mod_timer(>timer, jiffies);
}
@@ -1662,7 +1662,7 @@ br_multicast_leave_group(struct net_bridge *br,
 br->multicast_last_member_interval;
 
if (!port) {
-   if (mp->mglist &&
+   if (mp->host_joined &&
(timer_pending(>timer) ?
 time_after(mp->timer.expires, time) :
 try_to_del_timer_sync(>timer) >= 0)) {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 40553d832b6e..1312b8d20ec3 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -209,7 +209,7 @@ struct net_bridge_mdb_entry
struct rcu_head rcu;
struct timer_list   timer;
struct br_ipaddr;
-   boolmglist;
+   boolhost_joined;
 };
 
 struct net_bridge_mdb_htable
-- 
2.15.0

[PATCH v4 net-next 3/6] net: bridge: Add/del switchdev object on host join/leave

2017-11-09 Thread Andrew Lunn

When the host joins or leaves a multicast group, use switchdev to add
an object to the hardware to forward traffic for the group to the
host.

Signed-off-by: Andrew Lunn 
Acked-by: Nikolay Aleksandrov 
---
v4: Use switch statement in br_mdb_switchdev_host_port()
---
 include/net/switchdev.h   |  1 +
 net/bridge/br_mdb.c   | 43 +++
 net/switchdev/switchdev.c |  2 ++
 3 files changed, 46 insertions(+)

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index d756fbe46625..39bc855d7fee 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -76,6 +76,7 @@ enum switchdev_obj_id {
SWITCHDEV_OBJ_ID_UNDEFINED,
SWITCHDEV_OBJ_ID_PORT_VLAN,
SWITCHDEV_OBJ_ID_PORT_MDB,
+   SWITCHDEV_OBJ_ID_HOST_MDB,
 };
 
 struct switchdev_obj {
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 702408d2a93c..b0f4c734900b 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -292,6 +292,46 @@ static void br_mdb_complete(struct net_device *dev, int 
err, void *priv)
kfree(priv);
 }
 
+static void br_mdb_switchdev_host_port(struct net_device *dev,
+  struct net_device *lower_dev,
+  struct br_mdb_entry *entry, int type)
+{
+   struct switchdev_obj_port_mdb mdb = {
+   .obj = {
+   .id = SWITCHDEV_OBJ_ID_HOST_MDB,
+   .flags = SWITCHDEV_F_DEFER,
+   },
+   .vid = entry->vid,
+   };
+
+   if (entry->addr.proto == htons(ETH_P_IP))
+   ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+#if IS_ENABLED(CONFIG_IPV6)
+   else
+   ipv6_eth_mc_map(>addr.u.ip6, mdb.addr);
+#endif
+
+   mdb.obj.orig_dev = dev;
+   switch (type) {
+   case RTM_NEWMDB:
+   switchdev_port_obj_add(lower_dev, );
+   break;
+   case RTM_DELMDB:
+   switchdev_port_obj_del(lower_dev, );
+   break;
+   }
+}
+
+static void br_mdb_switchdev_host(struct net_device *dev,
+ struct br_mdb_entry *entry, int type)
+{
+   struct net_device *lower_dev;
+   struct list_head *iter;
+
+   netdev_for_each_lower_dev(dev, lower_dev, iter)
+   br_mdb_switchdev_host_port(dev, lower_dev, entry, type);
+}
+
 static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
struct br_mdb_entry *entry, int type)
 {
@@ -331,6 +371,9 @@ static void __br_mdb_notify(struct net_device *dev, struct 
net_bridge_port *p,
switchdev_port_obj_del(port_dev, );
}
 
+   if (!p)
+   br_mdb_switchdev_host(dev, entry, type);
+
skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC);
if (!skb)
goto errout;
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 0531b41d1f2d..74b9d916a58b 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -345,6 +345,8 @@ static size_t switchdev_obj_size(const struct switchdev_obj 
*obj)
return sizeof(struct switchdev_obj_port_vlan);
case SWITCHDEV_OBJ_ID_PORT_MDB:
return sizeof(struct switchdev_obj_port_mdb);
+   case SWITCHDEV_OBJ_ID_HOST_MDB:
+   return sizeof(struct switchdev_obj_port_mdb);
default:
BUG();
}
-- 
2.15.0

[PATCH v4 net-next 5/6] net: dsa: add more const attributes

2017-11-09 Thread Andrew Lunn

The notify mechanism does not need to modify the port it is notifying.
So make the parameter const.

Signed-off-by: Andrew Lunn 
---
 net/dsa/dsa_priv.h | 4 ++--
 net/dsa/port.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index bb0218c1b570..507e1ce4d4d2 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -147,10 +147,10 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned 
char *addr,
 int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
 u16 vid);
 int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data);
-int dsa_port_mdb_add(struct dsa_port *dp,
+int dsa_port_mdb_add(const struct dsa_port *dp,
 const struct switchdev_obj_port_mdb *mdb,
 struct switchdev_trans *trans);
-int dsa_port_mdb_del(struct dsa_port *dp,
+int dsa_port_mdb_del(const struct dsa_port *dp,
 const struct switchdev_obj_port_mdb *mdb);
 int dsa_port_vlan_add(struct dsa_port *dp,
  const struct switchdev_obj_port_vlan *vlan,
diff --git a/net/dsa/port.c b/net/dsa/port.c
index bb30b1a7de3a..a85cd63a91c4 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -17,7 +17,7 @@
 
 #include "dsa_priv.h"
 
-static int dsa_port_notify(struct dsa_port *dp, unsigned long e, void *v)
+static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v)
 {
struct raw_notifier_head *nh = >ds->dst->nh;
int err;
@@ -215,7 +215,7 @@ int dsa_port_fdb_dump(struct dsa_port *dp, 
dsa_fdb_dump_cb_t *cb, void *data)
return ds->ops->port_fdb_dump(ds, port, cb, data);
 }
 
-int dsa_port_mdb_add(struct dsa_port *dp,
+int dsa_port_mdb_add(const struct dsa_port *dp,
 const struct switchdev_obj_port_mdb *mdb,
 struct switchdev_trans *trans)
 {
@@ -229,7 +229,7 @@ int dsa_port_mdb_add(struct dsa_port *dp,
return dsa_port_notify(dp, DSA_NOTIFIER_MDB_ADD, );
 }
 
-int dsa_port_mdb_del(struct dsa_port *dp,
+int dsa_port_mdb_del(const struct dsa_port *dp,
 const struct switchdev_obj_port_mdb *mdb)
 {
struct dsa_notifier_mdb_info info = {
-- 
2.15.0

[PATCH v4 net-next 6/6] net: dsa: switch: Don't add CPU port to an mdb by default

2017-11-09 Thread Andrew Lunn

Now that the host indicates when a multicast group should be forwarded
from the switch to the host, don't do it by default.

Signed-off-by: Andrew Lunn 
---
v4:
Fix unrelated white space change
---
 net/dsa/switch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index e6c06aa349a6..1155e43c157f 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -121,7 +121,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
if (ds->index == info->sw_index)
set_bit(info->port, group);
for (port = 0; port < ds->num_ports; port++)
-   if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
+   if (dsa_is_dsa_port(ds, port))
set_bit(port, group);
 
if (switchdev_trans_ph_prepare(trans)) {
-- 
2.15.0

1 2 3 >

1 - 100 of 239 matches

Mail list logo