Re: [PATCH] mlxsw: core: Fix an error handling path in \'mlxsw_core_bus_device_register()\'

2018-05-10 Thread Arkadi Sharshevsky
Hi Dan,

I will fix the error path. Regarding the goto label, this is
the convention in the driver.

Thanks,
Arkadi


[PATCH net] devlink: Remove redundant free on error path

2018-03-18 Thread Arkadi Sharshevsky
The current code performs an unneeded free. Remove the redundant skb freeing
on the error path.

Fixes: 1555d204e743 ("devlink: Support for pipeline debug (dpipe)")
Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 net/core/devlink.c | 16 
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index f23e5ed..7917838 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1798,7 +1798,7 @@ static int devlink_dpipe_tables_fill(struct genl_info 
*info,
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
-   goto err_skb_send_alloc;
+   return err;
goto send_done;
}
 
@@ -1807,7 +1807,6 @@ static int devlink_dpipe_tables_fill(struct genl_info 
*info,
 nla_put_failure:
err = -EMSGSIZE;
 err_table_put:
-err_skb_send_alloc:
genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
@@ -2073,7 +2072,7 @@ static int devlink_dpipe_entries_fill(struct genl_info 
*info,
 table->counters_enabled,
 &dump_ctx);
if (err)
-   goto err_entries_dump;
+   return err;
 
 send_done:
nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq,
@@ -2081,16 +2080,10 @@ static int devlink_dpipe_entries_fill(struct genl_info 
*info,
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info);
if (err)
-   goto err_skb_send_alloc;
+   return err;
goto send_done;
}
return genlmsg_reply(dump_ctx.skb, info);
-
-err_entries_dump:
-err_skb_send_alloc:
-   genlmsg_cancel(dump_ctx.skb, dump_ctx.hdr);
-   nlmsg_free(dump_ctx.skb);
-   return err;
 }
 
 static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb,
@@ -2229,7 +2222,7 @@ static int devlink_dpipe_headers_fill(struct genl_info 
*info,
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
-   goto err_skb_send_alloc;
+   return err;
goto send_done;
}
return genlmsg_reply(skb, info);
@@ -2237,7 +2230,6 @@ static int devlink_dpipe_headers_fill(struct genl_info 
*info,
 nla_put_failure:
err = -EMSGSIZE;
 err_table_put:
-err_skb_send_alloc:
genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
-- 
2.4.11



[PATCH net-next] devlink: Change dpipe/resource get privileges

2018-03-08 Thread Arkadi Sharshevsky
Let dpipe/resource be retrieved by unprivileged users.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 net/core/devlink.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 5bdc61e..f783ea9 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2733,22 +2733,22 @@ static const struct genl_ops devlink_nl_ops[] = {
.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
.doit = devlink_nl_cmd_dpipe_table_get,
.policy = devlink_nl_policy,
-   .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+   /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
.doit = devlink_nl_cmd_dpipe_entries_get,
.policy = devlink_nl_policy,
-   .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+   /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
.doit = devlink_nl_cmd_dpipe_headers_get,
.policy = devlink_nl_policy,
-   .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+   /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
@@ -2768,8 +2768,8 @@ static const struct genl_ops devlink_nl_ops[] = {
.cmd = DEVLINK_CMD_RESOURCE_DUMP,
.doit = devlink_nl_cmd_resource_dump,
.policy = devlink_nl_policy,
-   .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+   /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_RELOAD,
-- 
2.4.11



[PATCH net] team: Fix double free in error path

2018-03-08 Thread Arkadi Sharshevsky
__send_and_alloc_skb() receives an skb pointer as a parameter, but if it
fails the skb is no longer valid, because either:
- The send failed and the skb was released internally.
- The allocation failed.

The current code tries to release the skb on failure, which
results in a redundant free.

Fixes: 9b00cf2d1024 ("team: implement multipart netlink messages for options 
transfers")
Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 drivers/net/team/team.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index a468439..56c701b 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2395,7 +2395,7 @@ static int team_nl_send_options_get(struct team *team, 
u32 portid, u32 seq,
if (!nlh) {
err = __send_and_alloc_skb(&skb, team, portid, send_func);
if (err)
-   goto errout;
+   return err;
goto send_done;
}
 
@@ -2681,7 +2681,7 @@ static int team_nl_send_port_list_get(struct team *team, 
u32 portid, u32 seq,
if (!nlh) {
err = __send_and_alloc_skb(&skb, team, portid, send_func);
if (err)
-   goto errout;
+   return err;
goto send_done;
}
 
-- 
2.4.11



[PATCH net-next] selftests: Extend the tc action test for action mirror

2018-03-04 Thread Arkadi Sharshevsky
Currently the tc action test is used only to test the mirred redirect
action. This patch extends it to also cover the mirred mirror action.

Signed-off-by: Jiri Pirko <j...@mellanox.com>
Reviewed-by: Ido Schimmel <ido...@mellanox.com>
Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 tools/testing/selftests/net/forwarding/tc_actions.sh | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh 
b/tools/testing/selftests/net/forwarding/tc_actions.sh
index 8423431..bc09a36 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -45,8 +45,10 @@ switch_destroy()
simple_if_fini $swp1 192.0.2.2/24
 }
 
-mirred_egress_redirect_test()
+mirred_egress_test()
 {
+   local action=$1
+
RET=0
 
tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
@@ -59,19 +61,19 @@ mirred_egress_redirect_test()
check_fail $? "Matched without redirect rule inserted"
 
tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \
-   $tcflags dst_ip 192.0.2.2 action mirred egress redirect \
+   $tcflags dst_ip 192.0.2.2 action mirred egress $action \
dev $swp2
 
$MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
-t ip -q
 
tc_check_packets "dev $h2 ingress" 101 1
-   check_err $? "Did not match incoming redirected packet"
+   check_err $? "Did not match incoming $action packet"
 
tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower
tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
 
-   log_test "mirred egress redirect ($tcflags)"
+   log_test "mirred egress $action ($tcflags)"
 }
 
 gact_drop_and_ok_test()
@@ -180,7 +182,8 @@ setup_prepare
 setup_wait
 
 gact_drop_and_ok_test
-mirred_egress_redirect_test
+mirred_egress_test "redirect"
+mirred_egress_test "mirror"
 
 tc_offload_check
 if [[ $? -ne 0 ]]; then
@@ -188,7 +191,8 @@ if [[ $? -ne 0 ]]; then
 else
tcflags="skip_sw"
gact_drop_and_ok_test
-   mirred_egress_redirect_test
+   mirred_egress_test "redirect"
+   mirred_egress_test "mirror"
gact_trap_test
 fi
 
-- 
2.4.11



[PATCH iproute2] devlink: Fix error reporting

2018-02-28 Thread Arkadi Sharshevsky
The current code doesn't set errno when an extended ack is reported.

Fixes: 049c58539f5d ("devlink: mnlg: Add support for extended ack")
Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 devlink/mnlg.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/devlink/mnlg.c b/devlink/mnlg.c
index 37c5687..3d28453 100644
--- a/devlink/mnlg.c
+++ b/devlink/mnlg.c
@@ -71,15 +71,15 @@ static int mnlg_cb_error(const struct nlmsghdr *nlh, void 
*data)
 {
const struct nlmsgerr *err = mnl_nlmsg_get_payload(nlh);
 
-   if (nl_dump_ext_ack(nlh, NULL))
-   return MNL_CB_STOP;
-
/* Netlink subsystems returns the errno value with different signess */
if (err->error < 0)
errno = -err->error;
else
errno = err->error;
 
+   if (nl_dump_ext_ack(nlh, NULL))
+   return MNL_CB_ERROR;
+
return err->error == 0 ? MNL_CB_STOP : MNL_CB_ERROR;
 }
 
-- 
2.4.11



[PATCH net-next] devlink: Fix resource coverity errors

2018-02-26 Thread Arkadi Sharshevsky
Fix Coverity errors in the resource code by checking the return values of
the nla_put_*() calls.

Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction")
Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 net/core/devlink.c | 37 +
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index d6310f7..617a312 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1695,10 +1695,11 @@ static int devlink_dpipe_table_put(struct sk_buff *skb,
goto nla_put_failure;
 
if (table->resource_valid) {
-   nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
- table->resource_id, DEVLINK_ATTR_PAD);
-   nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
- table->resource_units, DEVLINK_ATTR_PAD);
+   if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
+ table->resource_id, DEVLINK_ATTR_PAD) ||
+   nla_put_u64_64bit(skb, 
DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
+ table->resource_units, DEVLINK_ATTR_PAD))
+   goto nla_put_failure;
}
if (devlink_dpipe_matches_put(table, skb))
goto nla_put_failure;
@@ -2394,20 +2395,22 @@ static int devlink_nl_cmd_resource_set(struct sk_buff 
*skb,
return 0;
 }
 
-static void
+static int
 devlink_resource_size_params_put(struct devlink_resource *resource,
 struct sk_buff *skb)
 {
struct devlink_resource_size_params *size_params;
 
size_params = resource->size_params;
-   nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
- size_params->size_granularity, DEVLINK_ATTR_PAD);
-   nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
- size_params->size_max, DEVLINK_ATTR_PAD);
-   nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
- size_params->size_min, DEVLINK_ATTR_PAD);
-   nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit);
+   if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
+ size_params->size_granularity, DEVLINK_ATTR_PAD) 
||
+   nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
+ size_params->size_max, DEVLINK_ATTR_PAD) ||
+   nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
+ size_params->size_min, DEVLINK_ATTR_PAD) ||
+   nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit))
+   return -EMSGSIZE;
+   return 0;
 }
 
 static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
@@ -2431,10 +2434,12 @@ static int devlink_resource_put(struct devlink 
*devlink, struct sk_buff *skb,
nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
  resource->size_new, DEVLINK_ATTR_PAD);
if (resource->resource_ops && resource->resource_ops->occ_get)
-   nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
- resource->resource_ops->occ_get(devlink),
- DEVLINK_ATTR_PAD);
-   devlink_resource_size_params_put(resource, skb);
+   if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
+ resource->resource_ops->occ_get(devlink),
+ DEVLINK_ATTR_PAD))
+   goto nla_put_failure;
+   if (devlink_resource_size_params_put(resource, skb))
+   goto nla_put_failure;
if (list_empty(&resource->resource_list))
goto out;
 
-- 
2.4.11



Re: [net-next PATCH 2/2] mlxsw: spectrum_kvdl: avoid uninitialized variable warning

2018-02-25 Thread Arkadi Sharshevsky


On 02/23/2018 03:15 PM, Arnd Bergmann wrote:
> gcc warns that 'resource_id' is not initialized if we don't come through
> any of the three 'case' statements before:
> 
> drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c: In function 
> 'mlxsw_sp_kvdl_part_init':
> drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c:275:8: error: 
> 'resource_id' may be used uninitialized in this function 
> [-Werror=maybe-uninitialized]
> 
> In the current code, that won't happen, but it's more robust to explicitly
> handle this by returning a failure from mlxsw_sp_kvdl_part_init.
> 
> Fixes: 887839e6960d ("mlxsw: spectrum_kvdl: Add support for dynamic partition 
> set")
> Signed-off-by: Arnd Bergmann <a...@arndb.de>
> ---
>  drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c 
> b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
> index 6fd701db90c9..059eb3214328 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
> @@ -270,6 +270,8 @@ static int mlxsw_sp_kvdl_part_init(struct mlxsw_sp 
> *mlxsw_sp,
>   case MLXSW_SP_KVDL_PART_LARGE_CHUNKS:
>   resource_id = MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS;
>   break;
> + default:
> + return -EINVAL;
>   }
>  
>   err = devlink_resource_size_get(devlink, resource_id, &resource_size);
> 
Acked-by: Arkadi Sharshevsky <arka...@mellanox.com>


Re: [net-next PATCH 1/2] mlxsw: spectrum_kvdl: use div_u64() for 64-bit division

2018-02-25 Thread Arkadi Sharshevsky


On 02/23/2018 03:15 PM, Arnd Bergmann wrote:
> Calculating the number of entries now uses 64-bit arithmetic that
> causes a link error on 32-bit architectures:
> 
> drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.o: In function 
> `mlxsw_sp_kvdl_init':
> spectrum_kvdl.c:(.text+0x51c): undefined reference to `__aeabi_uldivmod'
> 
> We could probably use a 32-bit division here as before, but since this is
> not in a performance critical path, div_u64() seems cleaner here.
> 
> Fixes: 887839e6960d ("mlxsw: spectrum_kvdl: Add support for dynamic partition 
> set")
> Signed-off-by: Arnd Bergmann <a...@arndb.de>
> ---
>  drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c 
> b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
> index d27fa57ad3c3..6fd701db90c9 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
> @@ -278,7 +278,7 @@ static int mlxsw_sp_kvdl_part_init(struct mlxsw_sp 
> *mlxsw_sp,
>   resource_size = info->end_index - info->start_index + 1;
>   }
>  
> - nr_entries = resource_size / info->alloc_size;
> + nr_entries = div_u64(resource_size, info->alloc_size);
>   usage_size = BITS_TO_LONGS(nr_entries) * sizeof(unsigned long);
>   part = kzalloc(sizeof(*part) + usage_size, GFP_KERNEL);
>   if (!part)
> 

Acked-by: Arkadi Sharshevsky <arka...@mellanox.com>


Re: [PATCH iproute2 0/7] Add support for devlink resource abstraction

2018-02-22 Thread Arkadi Sharshevsky


On 02/15/2018 05:41 AM, David Ahern wrote:
> On 2/14/18 1:55 AM, Arkadi Sharshevsky wrote:
>> Add support for devlink resource abstraction.
>>
>> Arkadi Sharshevsky (7):
>>   devlink: Change empty line indication with indentations
>>   devlink: mnlg: Add support for extended ack
>>   devlink: Add support for devlink resource abstraction
>>   devlink: Add support for hot reload
>>   devlink: Move dpipe context from heap to stack
>>   devlink: Add support for resource/dpipe relation
>>   devlink: Update man pages and add resource man
>>
>>  devlink/devlink.c   | 774 
>> 
>>  devlink/mnlg.c  |  53 ++-
>>  include/libnetlink.h|   1 +
>>  include/list.h  |   5 +
>>  lib/libnetlink.c|   4 +-
>>  man/man8/devlink-dev.8  |  15 +
>>  man/man8/devlink-resource.8 |  78 +
>>  man/man8/devlink.8  |   1 +
>>  8 files changed, 871 insertions(+), 60 deletions(-)
>>  create mode 100644 man/man8/devlink-resource.8
>>
> 
> Looks ok to me.
> 

Hi David, I noticed this series hasn't been applied yet.


Re: [PATCH iproute2 2/7] devlink: mnlg: Add support for extended ack

2018-02-15 Thread Arkadi Sharshevsky


On 02/14/2018 05:12 PM, Stephen Hemminger wrote:
> On Wed, 14 Feb 2018 10:55:17 +0200
> Arkadi Sharshevsky <arka...@mellanox.com> wrote:
> 
>> +static mnl_cb_t mnlg_cb_array[NLMSG_MIN_TYPE] = {
>> +[NLMSG_NOOP]= mnlg_cb_noop,
>> +[NLMSG_ERROR]   = mnlg_cb_error,
>> +[NLMSG_DONE]= mnlg_cb_stop,
>> +[NLMSG_OVERRUN] = mnlg_cb_noop,
>> +};
>> +
> 
> Could be const?
> 

I pass the array to mnl_cb_run2(), which will discard the 'const'
qualifier. So I don't think this is very beneficial.


[PATCH iproute2 2/7] devlink: mnlg: Add support for extended ack

2018-02-14 Thread Arkadi Sharshevsky
Add support for extended ack.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 devlink/mnlg.c   | 53 ++--
 include/libnetlink.h |  1 +
 lib/libnetlink.c |  4 ++--
 3 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/devlink/mnlg.c b/devlink/mnlg.c
index 9e27de2..37c5687 100644
--- a/devlink/mnlg.c
+++ b/devlink/mnlg.c
@@ -18,6 +18,8 @@
 #include 
 #include 
 
+#include "libnetlink.h"
+#include "utils.h"
 #include "mnlg.h"
 
 struct mnlg_socket {
@@ -60,6 +62,39 @@ int mnlg_socket_send(struct mnlg_socket *nlg, const struct 
nlmsghdr *nlh)
return mnl_socket_sendto(nlg->nl, nlh, nlh->nlmsg_len);
 }
 
+static int mnlg_cb_noop(const struct nlmsghdr *nlh, void *data)
+{
+   return MNL_CB_OK;
+}
+
+static int mnlg_cb_error(const struct nlmsghdr *nlh, void *data)
+{
+   const struct nlmsgerr *err = mnl_nlmsg_get_payload(nlh);
+
+   if (nl_dump_ext_ack(nlh, NULL))
+   return MNL_CB_STOP;
+
+   /* Netlink subsystems returns the errno value with different signess */
+   if (err->error < 0)
+   errno = -err->error;
+   else
+   errno = err->error;
+
+   return err->error == 0 ? MNL_CB_STOP : MNL_CB_ERROR;
+}
+
+static int mnlg_cb_stop(const struct nlmsghdr *nlh, void *data)
+{
+   return MNL_CB_STOP;
+}
+
+static mnl_cb_t mnlg_cb_array[NLMSG_MIN_TYPE] = {
+   [NLMSG_NOOP]= mnlg_cb_noop,
+   [NLMSG_ERROR]   = mnlg_cb_error,
+   [NLMSG_DONE]= mnlg_cb_stop,
+   [NLMSG_OVERRUN] = mnlg_cb_noop,
+};
+
 int mnlg_socket_recv_run(struct mnlg_socket *nlg, mnl_cb_t data_cb, void *data)
 {
int err;
@@ -69,8 +104,9 @@ int mnlg_socket_recv_run(struct mnlg_socket *nlg, mnl_cb_t 
data_cb, void *data)
  MNL_SOCKET_BUFFER_SIZE);
if (err <= 0)
break;
-   err = mnl_cb_run(nlg->buf, err, nlg->seq, nlg->portid,
-data_cb, data);
+   err = mnl_cb_run2(nlg->buf, err, nlg->seq, nlg->portid,
+ data_cb, data, mnlg_cb_array,
+ ARRAY_SIZE(mnlg_cb_array));
} while (err > 0);
 
return err;
@@ -220,6 +256,7 @@ struct mnlg_socket *mnlg_socket_open(const char 
*family_name, uint8_t version)
 {
struct mnlg_socket *nlg;
struct nlmsghdr *nlh;
+   int one = 1;
int err;
 
nlg = malloc(sizeof(*nlg));
@@ -234,6 +271,16 @@ struct mnlg_socket *mnlg_socket_open(const char 
*family_name, uint8_t version)
if (!nlg->nl)
goto err_mnl_socket_open;
 
+   err = mnl_socket_setsockopt(nlg->nl, NETLINK_CAP_ACK, &one,
+   sizeof(one));
+   if (err)
+   goto err_mnl_set_ack;
+
+   err = mnl_socket_setsockopt(nlg->nl, NETLINK_EXT_ACK, &one,
+   sizeof(one));
+   if (err)
+   goto err_mnl_set_ext_ack;
+
err = mnl_socket_bind(nlg->nl, 0, MNL_SOCKET_AUTOPID);
if (err < 0)
goto err_mnl_socket_bind;
@@ -258,6 +305,8 @@ struct mnlg_socket *mnlg_socket_open(const char 
*family_name, uint8_t version)
 err_mnlg_socket_recv_run:
 err_mnlg_socket_send:
 err_mnl_socket_bind:
+err_mnl_set_ext_ack:
+err_mnl_set_ack:
mnl_socket_close(nlg->nl);
 err_mnl_socket_open:
free(nlg->buf);
diff --git a/include/libnetlink.h b/include/libnetlink.h
index d632219..9d9249e 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -109,6 +109,7 @@ int rtnl_send(struct rtnl_handle *rth, const void *buf, int)
__attribute__((warn_unused_result));
 int rtnl_send_check(struct rtnl_handle *rth, const void *buf, int)
__attribute__((warn_unused_result));
+int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn);
 
 int addattr(struct nlmsghdr *n, int maxlen, int type);
 int addattr8(struct nlmsghdr *n, int maxlen, int type, __u8 data);
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 7ca47b2..8bb1c8d 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -65,7 +65,7 @@ static int err_attr_cb(const struct nlattr *attr, void *data)
 }
 
 /* dump netlink extended ack error message */
-static int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn)
+int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn)
 {
struct nlattr *tb[NLMSGERR_ATTR_MAX + 1] = {};
const struct nlmsgerr *err = mnl_nlmsg_get_payload(nlh);
@@ -120,7 +120,7 @@ static int nl_dump_ext_ack(const struct nlmsghdr *nlh, 
nl_ext_ack_fn_t errfn)
 #warning "libmnl required for error support"
 
 /* No extended error ack without libmnl */
-static int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn)
+int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn)
 {
return 0;
 }
-- 
2.4.11



[PATCH iproute2 4/7] devlink: Add support for hot reload

2018-02-14 Thread Arkadi Sharshevsky
Add support for hot reload. It should be used in order for resource
updates to take place.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 devlink/devlink.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 4c392a7..21835d9 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -1179,6 +1179,7 @@ static void cmd_dev_help(void)
pr_err("   [ inline-mode { none | link | 
network | transport } ]\n");
pr_err("   [ encap { disable | enable } 
]\n");
pr_err("   devlink dev eswitch show DEV\n");
+   pr_err("   devlink dev reload DEV\n");
 }
 
 static bool cmp_arr_last_handle(struct dl *dl, const char *bus_name,
@@ -1620,6 +1621,31 @@ static int cmd_dev_show(struct dl *dl)
return err;
 }
 
+static void cmd_dev_reload_help(void)
+{
+   pr_err("Usage: devlink dev reload [ DEV ]\n");
+}
+
+static int cmd_dev_reload(struct dl *dl)
+{
+   struct nlmsghdr *nlh;
+   int err;
+
+   if (dl_argv_match(dl, "help") || dl_no_arg(dl)) {
+   cmd_dev_reload_help();
+   return 0;
+   }
+
+   nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_RELOAD,
+  NLM_F_REQUEST | NLM_F_ACK);
+
+   err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE, 0);
+   if (err)
+   return err;
+
+   return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
+}
+
 static int cmd_dev(struct dl *dl)
 {
if (dl_argv_match(dl, "help")) {
@@ -1632,6 +1658,9 @@ static int cmd_dev(struct dl *dl)
} else if (dl_argv_match(dl, "eswitch")) {
dl_arg_inc(dl);
return cmd_dev_eswitch(dl);
+   } else if (dl_argv_match(dl, "reload")) {
+   dl_arg_inc(dl);
+   return cmd_dev_reload(dl);
}
pr_err("Command \"%s\" not found\n", dl_argv(dl));
return -ENOENT;
-- 
2.4.11



[PATCH iproute2 5/7] devlink: Move dpipe context from heap to stack

2018-02-14 Thread Arkadi Sharshevsky
Place the dpipe context on the stack instead of allocating it dynamically.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 devlink/devlink.c | 67 ++-
 1 file changed, 27 insertions(+), 40 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 21835d9..aec36ff 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -2882,25 +2882,15 @@ static void dpipe_header_del(struct dpipe_header 
*header)
list_del(&header->list);
 }
 
-static struct dpipe_ctx *dpipe_ctx_alloc(struct dl *dl)
+static int dpipe_ctx_init(struct dpipe_ctx *ctx, struct dl *dl)
 {
-   struct dpipe_ctx *ctx;
-
-   ctx = calloc(1, sizeof(struct dpipe_ctx));
-   if (!ctx)
-   return NULL;
ctx->dl = dl;
INIT_LIST_HEAD(&ctx->global_headers);
INIT_LIST_HEAD(&ctx->local_headers);
-   return ctx;
-}
-
-static void dpipe_ctx_free(struct dpipe_ctx *ctx)
-{
-   free(ctx);
+   return 0;
 }
 
-static void dpipe_ctx_clear(struct dpipe_ctx *ctx)
+static void dpipe_ctx_fini(struct dpipe_ctx *ctx)
 {
struct dpipe_header *header, *tmp;
 
@@ -3171,7 +3161,7 @@ static int cmd_dpipe_header_cb(const struct nlmsghdr 
*nlh, void *data)
 static int cmd_dpipe_headers_show(struct dl *dl)
 {
struct nlmsghdr *nlh;
-   struct dpipe_ctx *ctx;
+   struct dpipe_ctx ctx = {};
uint16_t flags = NLM_F_REQUEST | NLM_F_ACK;
int err;
 
@@ -3181,20 +3171,19 @@ static int cmd_dpipe_headers_show(struct dl *dl)
if (err)
return err;
 
-   ctx = dpipe_ctx_alloc(dl);
-   if (!ctx)
-   return -ENOMEM;
+   err = dpipe_ctx_init(&ctx, dl);
+   if (err)
+   return err;
 
-   ctx->print_headers = true;
+   ctx.print_headers = true;
 
pr_out_section_start(dl, "header");
-   err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_dpipe_header_cb, ctx);
+   err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_dpipe_header_cb, &ctx);
if (err)
-   pr_err("error get headers %s\n", strerror(ctx->err));
+   pr_err("error get headers %s\n", strerror(ctx.err));
pr_out_section_end(dl);
 
-   dpipe_ctx_clear(ctx);
-   dpipe_ctx_free(ctx);
+   dpipe_ctx_fini(&ctx);
return err;
 }
 
@@ -3532,13 +3521,13 @@ static int cmd_dpipe_table_show_cb(const struct 
nlmsghdr *nlh, void *data)
 static int cmd_dpipe_table_show(struct dl *dl)
 {
struct nlmsghdr *nlh;
-   struct dpipe_ctx *ctx;
+   struct dpipe_ctx ctx = {};
uint16_t flags = NLM_F_REQUEST;
int err;
 
-   ctx = dpipe_ctx_alloc(dl);
-   if (!ctx)
-   return -ENOMEM;
+   err = dpipe_ctx_init(&ctx, dl);
+   if (err)
+   return err;
 
err = dl_argv_parse(dl, DL_OPT_HANDLE, DL_OPT_DPIPE_TABLE_NAME);
if (err)
@@ -3546,9 +3535,9 @@ static int cmd_dpipe_table_show(struct dl *dl)
 
nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_DPIPE_HEADERS_GET, flags);
dl_opts_put(nlh, dl);
-   err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_dpipe_header_cb, ctx);
+   err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_dpipe_header_cb, &ctx);
if (err) {
-   pr_err("error get headers %s\n", strerror(ctx->err));
+   pr_err("error get headers %s\n", strerror(ctx.err));
goto out;
}
 
@@ -3557,11 +3546,10 @@ static int cmd_dpipe_table_show(struct dl *dl)
dl_opts_put(nlh, dl);
 
pr_out_section_start(dl, "table");
-   _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_dpipe_table_show_cb, ctx);
+   _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_dpipe_table_show_cb, &ctx);
pr_out_section_end(dl);
 out:
-   dpipe_ctx_clear(ctx);
-   dpipe_ctx_free(ctx);
+   dpipe_ctx_fini(&ctx);
return err;
 }
 
@@ -3930,13 +3918,13 @@ static int cmd_dpipe_table_entry_dump_cb(const struct 
nlmsghdr *nlh, void *data)
 static int cmd_dpipe_table_dump(struct dl *dl)
 {
struct nlmsghdr *nlh;
-   struct dpipe_ctx *ctx;
+   struct dpipe_ctx ctx = {};
uint16_t flags = NLM_F_REQUEST;
int err;
 
-   ctx = dpipe_ctx_alloc(dl);
-   if (!ctx)
-   return -ENOMEM;
+   err = dpipe_ctx_init(&ctx, dl);
+   if (err)
+   return err;
 
err = dl_argv_parse(dl, DL_OPT_HANDLE | DL_OPT_DPIPE_TABLE_NAME, 0);
if (err)
@@ -3944,9 +3932,9 @@ static int cmd_dpipe_table_dump(struct dl *dl)
 
nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_DPIPE_HEADERS_GET, flags);
dl_opts_put(nlh, dl);
-   err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_dpipe_header_cb, ctx);
+   err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_dpipe_header_cb, &ctx);
if (err) {
-   pr_err("error get headers %s\n", strerror(ctx->err));
+

[PATCH iproute2 3/7] devlink: Add support for devlink resource abstraction

2018-02-14 Thread Arkadi Sharshevsky
Add support for devlink resource abstraction. The resources are
represented by a tree based structure and are identified by a name and
a size. Some resources can present their real time occupancy.

First the resources exposed by the driver can be observed, for example:

$devlink resource show pci/0000:03:00.0
pci/0000:03:00.0:
  name kvd size 245760 unit entry
resources:
  name linear size 98304 occ 0 unit entry size_min 0 size_max 147456 
size_gran 128
  name hash_double size 60416 unit entry size_min 32768 size_max 180224 
size_gran 128
  name hash_single size 87040 unit entry size_min 65536 size_max 212992 
size_gran 128

Some resource's size can be changed. Examples:

$devlink resource set pci/0000:03:00.0 path /kvd/hash_single size 73088
$devlink resource set pci/0000:03:00.0 path /kvd/hash_double size 74368

The changes do not apply immediately; this can be verified via the 'size_new'
attribute, which represents the pending changed size. For example:

$devlink resource show pci/0000:03:00.0
pci/0000:03:00.0:
  name kvd size 245760 unit entry size_valid false
  resources:
name linear size 98304 size_new 147456 occ 0 unit entry size_min 0 size_max 
147456 size_gran 128
name hash_double size 60416 unit entry size_min 32768 size_max 180224 
size_gran 128
name hash_single size 87040 unit entry size_min 65536 size_max 212992 
size_gran 128

In case of a pending change, a parent resource indicates (via 'size_valid')
whether the configuration of its children is valid, i.e. whether the sum of
its children's sizes doesn't exceed the parent's size.

In order for the changes to take place hot reload is needed. The hot
reload through devlink will be introduced in the following patch.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 devlink/devlink.c | 490 +-
 include/list.h|   5 +
 2 files changed, 494 insertions(+), 1 deletion(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 8ef6041..4c392a7 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -185,6 +185,8 @@ static void ifname_map_free(struct ifname_map *ifname_map)
 #define DL_OPT_DPIPE_TABLE_NAMEBIT(13)
 #define DL_OPT_DPIPE_TABLE_COUNTERSBIT(14)
 #define DL_OPT_ESWITCH_ENCAP_MODE  BIT(15)
+#define DL_OPT_RESOURCE_PATH   BIT(16)
+#define DL_OPT_RESOURCE_SIZE   BIT(17)
 
 struct dl_opts {
uint32_t present; /* flags of present items */
@@ -205,6 +207,10 @@ struct dl_opts {
const char *dpipe_table_name;
bool dpipe_counters_enable;
bool eswitch_encap_mode;
+   const char *resource_path;
+   uint32_t resource_size;
+   uint32_t resource_id;
+   bool resource_id_valid;
 };
 
 struct dl {
@@ -953,6 +959,20 @@ static int dl_argv_parse(struct dl *dl, uint32_t 
o_required,
if (err)
return err;
o_found |= DL_OPT_ESWITCH_ENCAP_MODE;
+   } else if (dl_argv_match(dl, "path") &&
+  (o_all & DL_OPT_RESOURCE_PATH)) {
+   dl_arg_inc(dl);
+   err = dl_argv_str(dl, &opts->resource_path);
+   if (err)
+   return err;
+   o_found |= DL_OPT_RESOURCE_PATH;
+   } else if (dl_argv_match(dl, "size") &&
+  (o_all & DL_OPT_RESOURCE_SIZE)) {
+   dl_arg_inc(dl);
+   err = dl_argv_uint32_t(dl, &opts->resource_size);
+   if (err)
+   return err;
+   o_found |= DL_OPT_RESOURCE_SIZE;
} else {
pr_err("Unknown option \"%s\"\n", dl_argv(dl));
return -EINVAL;
@@ -1095,6 +1115,12 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl 
*dl)
if (opts->present & DL_OPT_ESWITCH_ENCAP_MODE)
mnl_attr_put_u8(nlh, DEVLINK_ATTR_ESWITCH_ENCAP_MODE,
opts->eswitch_encap_mode);
+   if ((opts->present & DL_OPT_RESOURCE_PATH) && opts->resource_id_valid)
+   mnl_attr_put_u64(nlh, DEVLINK_ATTR_RESOURCE_ID,
+opts->resource_id);
+   if (opts->present & DL_OPT_RESOURCE_SIZE)
+   mnl_attr_put_u64(nlh, DEVLINK_ATTR_RESOURCE_SIZE,
+opts->resource_size);
 }
 
 static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl,
@@ -2684,6 +2710,91 @@ struct dpipe_header {
unsigned int fields_count;
 };
 
+struct resource {
+   char *name;
+   uint64_t size;
+   uint64_t size_new;
+   uint64_t size_min;
+   uint64_t size_max;
+   uint64_t size_gran;
+   enum devlink_resource_unit unit;
+   bool size_valid;
+   

[PATCH iproute2 1/7] devlink: Change empty line indication with indentations

2018-02-14 Thread Arkadi Sharshevsky
Currently multi-line objects are separated by new-lines. This patch
changes this behavior by using indentations for separation.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 devlink/devlink.c | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 57e71ac..8ef6041 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -35,6 +35,8 @@
 #define ESWITCH_INLINE_MODE_NETWORK "network"
 #define ESWITCH_INLINE_MODE_TRANSPORT "transport"
 
+static int g_new_line_count;
+
 #define pr_err(args...) fprintf(stderr, ##args)
 #define pr_out(args...)\
do {\
@@ -43,6 +45,7 @@
g_indent_newline = false;   \
}   \
fprintf(stdout, ##args);\
+   g_new_line_count = 0;   \
} while (0)
 
 #define pr_out_sp(num, args...)\
@@ -50,6 +53,7 @@
int ret = fprintf(stdout, ##args);  \
if (ret < num)  \
fprintf(stdout, "%*s", num - ret, "");  \
+   g_new_line_count = 0;   \
} while (0)
 
 static int g_indent_level;
@@ -77,8 +81,11 @@ static void __pr_out_indent_dec(void)
 
 static void __pr_out_newline(void)
 {
-   pr_out("\n");
-   g_indent_newline = true;
+   if (g_new_line_count < 1) {
+   pr_out("\n");
+   g_indent_newline = true;
+   }
+   g_new_line_count++;
 }
 
 static int _mnlg_socket_recv_run(struct mnlg_socket *nlg,
@@ -1401,20 +1408,22 @@ static void pr_out_array_start(struct dl *dl, const 
char *name)
jsonw_name(dl->jw, name);
jsonw_start_array(dl->jw);
} else {
-   if (!g_indent_newline)
-   __pr_out_newline();
-   pr_out("%s:", name);
+   __pr_out_indent_inc();
__pr_out_newline();
+   pr_out("%s:", name);
__pr_out_indent_inc();
+   __pr_out_newline();
}
 }
 
 static void pr_out_array_end(struct dl *dl)
 {
-   if (dl->json_output)
+   if (dl->json_output) {
jsonw_end_array(dl->jw);
-   else
+   } else {
+   __pr_out_indent_dec();
__pr_out_indent_dec();
+   }
 }
 
 static void pr_out_entry_start(struct dl *dl)
-- 
2.4.11



[PATCH iproute2 0/7] Add support for devlink resource abstraction

2018-02-14 Thread Arkadi Sharshevsky
Add support for devlink resource abstraction.

Arkadi Sharshevsky (7):
  devlink: Change empty line indication with indentations
  devlink: mnlg: Add support for extended ack
  devlink: Add support for devlink resource abstraction
  devlink: Add support for hot reload
  devlink: Move dpipe context from heap to stack
  devlink: Add support for resource/dpipe relation
  devlink: Update man pages and add resource man

 devlink/devlink.c   | 774 
 devlink/mnlg.c  |  53 ++-
 include/libnetlink.h|   1 +
 include/list.h  |   5 +
 lib/libnetlink.c|   4 +-
 man/man8/devlink-dev.8  |  15 +
 man/man8/devlink-resource.8 |  78 +
 man/man8/devlink.8  |   1 +
 8 files changed, 871 insertions(+), 60 deletions(-)
 create mode 100644 man/man8/devlink-resource.8

-- 
2.4.11



[PATCH iproute2 6/7] devlink: Add support for resource/dpipe relation

2018-02-14 Thread Arkadi Sharshevsky
Dpipe - Each dpipe table can have one resource which is mapped to it.
The resource is presented via its full path. Furthermore, the number
of units consumed by a single table entry is presented.

Resource - Each resource presents the dpipe tables that use it.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 devlink/devlink.c | 201 +++---
 1 file changed, 175 insertions(+), 26 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index aec36ff..47120ce 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -2739,6 +2739,17 @@ struct dpipe_header {
unsigned int fields_count;
 };
 
+struct dpipe_table {
+   struct list_head list;
+   char *name;
+   unsigned int resource_id;
+   bool resource_valid;
+};
+
+struct dpipe_tables {
+   struct list_head table_list;
+};
+
 struct resource {
char *name;
uint64_t size;
@@ -2764,6 +2775,7 @@ struct resource_ctx {
struct dl *dl;
int err;
struct resources *resources;
+   struct dpipe_tables *tables;
bool print_resources;
bool pending_change;
 };
@@ -2829,7 +2841,10 @@ struct dpipe_ctx {
int err;
struct list_head global_headers;
struct list_head local_headers;
+   struct dpipe_tables *tables;
+   struct resources *resources;
bool print_headers;
+   bool print_tables;
 };
 
 static struct dpipe_header *dpipe_header_alloc(unsigned int fields_count)
@@ -2882,8 +2897,42 @@ static void dpipe_header_del(struct dpipe_header *header)
list_del(>list);
 }
 
+static struct dpipe_table *dpipe_table_alloc(void)
+{
+   return calloc(1, sizeof(struct dpipe_table));
+}
+
+static void dpipe_table_free(struct dpipe_table *table)
+{
+   free(table);
+}
+
+static struct dpipe_tables *dpipe_tables_alloc(void)
+{
+   struct dpipe_tables *tables;
+
+   tables = calloc(1, sizeof(struct dpipe_tables));
+   if (!tables)
+   return NULL;
+   INIT_LIST_HEAD(>table_list);
+   return tables;
+}
+
+static void dpipe_tables_free(struct dpipe_tables *tables)
+{
+   struct dpipe_table *table, *tmp;
+
+   list_for_each_entry_safe(table, tmp, >table_list, list)
+   dpipe_table_free(table);
+   free(tables);
+}
+
 static int dpipe_ctx_init(struct dpipe_ctx *ctx, struct dl *dl)
 {
+   ctx->tables = dpipe_tables_alloc();
+   if (!ctx->tables)
+   return -ENOMEM;
+
ctx->dl = dl;
INIT_LIST_HEAD(>global_headers);
INIT_LIST_HEAD(>local_headers);
@@ -2906,6 +2955,7 @@ static void dpipe_ctx_fini(struct dpipe_ctx *ctx)
dpipe_header_clear(header);
dpipe_header_free(header);
}
+   dpipe_tables_free(ctx->tables);
 }
 
 static const char *dpipe_header_id2s(struct dpipe_ctx *ctx,
@@ -3440,8 +3490,10 @@ resource_path_print(struct dl *dl, struct resources 
*resources,
 static int dpipe_table_show(struct dpipe_ctx *ctx, struct nlattr *nl)
 {
struct nlattr *nla_table[DEVLINK_ATTR_MAX + 1] = {};
+   struct dpipe_table *table;
+   uint32_t resource_units;
bool counters_enabled;
-   const char *name;
+   bool resource_valid;
uint32_t size;
int err;
 
@@ -3457,15 +3509,36 @@ static int dpipe_table_show(struct dpipe_ctx *ctx, 
struct nlattr *nl)
return -EINVAL;
}
 
-   name = mnl_attr_get_str(nla_table[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
+   table = dpipe_table_alloc();
+   if (!table)
+   return -ENOMEM;
+
+   table->name = 
strdup(mnl_attr_get_str(nla_table[DEVLINK_ATTR_DPIPE_TABLE_NAME]));
size = mnl_attr_get_u32(nla_table[DEVLINK_ATTR_DPIPE_TABLE_SIZE]);
counters_enabled = 
!!mnl_attr_get_u8(nla_table[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]);
 
-   pr_out_str(ctx->dl, "name", name);
+   resource_valid = !!nla_table[DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID];
+   if (resource_valid) {
+   table->resource_id = 
mnl_attr_get_u64(nla_table[DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID]);
+   table->resource_valid = true;
+   }
+
+   list_add_tail(>list, >tables->table_list);
+   if (!ctx->print_tables)
+   return 0;
+
+   pr_out_str(ctx->dl, "name", table->name);
pr_out_uint(ctx->dl, "size", size);
pr_out_str(ctx->dl, "counters_enabled",
   counters_enabled ? "true" : "false");
 
+   if (resource_valid) {
+   resource_units = 
mnl_attr_get_u32(nla_table[DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS]);
+   resource_path_print(ctx->dl, ctx->resources,
+   table->resource_id);
+   pr_out_uint(ctx->dl

[PATCH iproute2 7/7] devlink: Update man pages and add resource man

2018-02-14 Thread Arkadi Sharshevsky
Add resource man, and update dev manual for reload command.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 man/man8/devlink-dev.8  | 15 +
 man/man8/devlink-resource.8 | 78 +
 man/man8/devlink.8  |  1 +
 3 files changed, 94 insertions(+)
 create mode 100644 man/man8/devlink-resource.8

diff --git a/man/man8/devlink-dev.8 b/man/man8/devlink-dev.8
index b074d57..7c749dd 100644
--- a/man/man8/devlink-dev.8
+++ b/man/man8/devlink-dev.8
@@ -42,6 +42,10 @@ devlink-dev \- devlink device configuration
 .BR "devlink dev eswitch show"
 .IR DEV
 
+.ti -8
+.BR "devlink dev reload"
+.IR DEV
+
 .SH "DESCRIPTION"
 .SS devlink dev show - display devlink device attributes
 
@@ -94,6 +98,12 @@ Set eswitch encapsulation support
 .I enable
 - Enable encapsulation support
 
+.SS devlink dev reload - perform hot reload of the driver.
+
+.PP
+.I "DEV"
+- Specifies the devlink device to reload.
+
 .SH "EXAMPLES"
 .PP
 devlink dev show
@@ -114,6 +124,11 @@ Shows the eswitch mode of specified devlink device.
 devlink dev eswitch set pci/:01:00.0 mode switchdev
 .RS 4
 Sets the eswitch mode of specified devlink device to switchdev.
+.RE
+.PP
+devlink dev reload pci/0000:01:00.0
+.RS 4
+Performs hot reload of specified devlink device.
 
 .SH SEE ALSO
 .BR devlink (8),
diff --git a/man/man8/devlink-resource.8 b/man/man8/devlink-resource.8
new file mode 100644
index 000..b8f7880
--- /dev/null
+++ b/man/man8/devlink-resource.8
@@ -0,0 +1,78 @@
+.TH DEVLINK\-RESOURCE 8 "11 Feb 2018" "iproute2" "Linux"
+.SH NAME
+devlink-resource \- devlink device resource configuration
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+.ti -8
+.B devlink
+.RI "[ " OPTIONS " ]"
+.B resource
+.RI  " { " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.IR OPTIONS " := { "
+\fB\-v\fR[\fIerbose\fR] }
+
+.ti -8
+.B devlink resource show
+.IR DEV
+
+.ti -8
+.B devlink resource help
+
+.ti -8
+.BR "devlink resource set"
+.IR DEV
+.BI path " RESOURCE_PATH"
+.BI size " RESOURCE_SIZE"
+
+.SH "DESCRIPTION"
+.SS devlink resource show - display devlink device's resources
+
+.PP
+.I "DEV"
+- specifies the devlink device to show.
+
+.in +4
+Format is:
+.in +2
+BUS_NAME/BUS_ADDRESS
+
+.SS devlink resource set - sets resource size of specific resource
+
+.PP
+.I "DEV"
+- specifies the devlink device.
+
+.TP
+.BI path " RESOURCE_PATH"
+Resource's path.
+
+.TP
+.BI size " RESOURCE_SIZE"
+The new resource's size.
+
+.SH "EXAMPLES"
+.PP
+devlink resource show pci/0000:01:00.0
+.RS 4
+Shows the resources of the specified devlink device.
+.RE
+.PP
+devlink resource set pci/0000:01:00.0 path /kvd/linear size 98304
+.RS 4
+Sets the size of the specified resource for the specified devlink device.
+
+.SH SEE ALSO
+.BR devlink (8),
+.BR devlink-port (8),
+.BR devlink-sb (8),
+.BR devlink-monitor (8),
+.br
+
+.SH AUTHOR
+Arkadi Sharshevsky <arka...@mellanox.com>
diff --git a/man/man8/devlink.8 b/man/man8/devlink.8
index a975ef3..b83909d 100644
--- a/man/man8/devlink.8
+++ b/man/man8/devlink.8
@@ -103,6 +103,7 @@ Exit status is 0 if command was successful or a positive 
integer upon failure.
 .BR devlink-port (8),
 .BR devlink-monitor (8),
 .BR devlink-sb (8),
+.BR devlink-resource (8),
 .br
 
 .SH REPORTING BUGS
-- 
2.4.11



[PATCH iproute2] devlink: Ignore unknown attributes

2018-01-17 Thread Arkadi Sharshevsky
In case of extending the UAPI old packages would break.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 devlink/devlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 39cda06..c9d1838 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -343,7 +343,7 @@ static int attr_cb(const struct nlattr *attr, void *data)
int type;
 
if (mnl_attr_type_valid(attr, DEVLINK_ATTR_MAX) < 0)
-   return MNL_CB_ERROR;
+   return MNL_CB_OK;
 
type = mnl_attr_get_type(attr);
if (mnl_attr_validate(attr, devlink_policy[type]) < 0)
-- 
2.4.11



Re: [PATCH net-next] mlxsw: spectrum: Make function mlxsw_sp_kvdl_part_occ() static

2018-01-17 Thread Arkadi Sharshevsky


On 01/17/2018 05:27 AM, Wei Yongjun wrote:
> Fixes the following sparse warning:
> 
> drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c:289:5: warning:
>  symbol 'mlxsw_sp_kvdl_part_occ' was not declared. Should it be static?
> 
> Signed-off-by: Wei Yongjun <weiyongj...@huawei.com>

Acked-by: Arkadi Sharshevsky <arka...@mellanox.com>

> ---
>  drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c 
> b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
> index cfacc17..55f9d2d 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
> @@ -286,7 +286,7 @@ static void mlxsw_sp_kvdl_parts_fini(struct mlxsw_sp 
> *mlxsw_sp)
>   mlxsw_sp_kvdl_part_fini(mlxsw_sp, i);
>  }
>  
> -u64 mlxsw_sp_kvdl_part_occ(struct mlxsw_sp_kvdl_part *part)
> +static u64 mlxsw_sp_kvdl_part_occ(struct mlxsw_sp_kvdl_part *part)
>  {
>   unsigned int nr_entries;
>   int bit = -1;
> 


Re: [patch net-next v2 00/10] Add support for resource abstraction

2018-01-04 Thread Arkadi Sharshevsky


On 01/04/2018 05:58 PM, David Ahern wrote:
> On 1/4/18 2:24 AM, Arkadi Sharshevsky wrote:
>>> Again, my comments all stem from user experience.
>>>
>>> Can you explain what "double_word" means for a unit? I would expect a
>>> units to be kB or count (or items or entries).
>>>
>>
>> Double word is 64 bit, dont understand why this is confusing.
> 
> As Andrew pointed out, double word can have a range of sizes.
> 
> To my point here, a 'unit' for a number should not be the number of bits
> it is represented by. I believe all of the kvd sizes are in entries
> (ie., a linear size of 98304 means I can have 98,304 entries in that
> resource).
> 
>>
>>> $ devlink resource show pci/:03:00.0
>>> pci/:03:00.0:
>>>   name kvd size 245760 unit double_word size_valid true
>>>   resources:
>>> name linear size 98304 occ 0 unit double_word
>>> name hash_double size 60416 unit double_word
>>> name hash_single size 87040 unit double_word
>>>
>>> While that is confusing here from the userspace command it goes hand in
>>> hand with patch 2 and:
>>>
>>> +enum devlink_resource_unit {
>>> +   DEVLINK_RESOURCE_UNIT_DOUBLE_WORD,
>>> +};
>>>
>>>
>>> Also, it seems like the occ of 0 is wrong since we know from past
>>> responses that if I set linear to 0 all of networking breaks.
>>
>> You are clearly misunderstanding what is occupancy of the resource
>> and what kvd linear is good for. As I mentioned in the last response
>> kvd linear is mapped to adjacency table. So in case its 0 no nexthop
>> routes could be configured, this information is provided by the
>> dpipe<-->resource.
>>
>> Occupancy means how much of the resource is used right now, why is
>> this wrong? and how its related to the size 0 exactly?
> 
> The summary line above shows the current kvd/linear occupancy is '0'.
> That suggests my L3 only deployment is not using any kvd/linear which
> means I can set its allocation to 0 and devote more kvd resources to the
> hash regions.
> 
> But, as I pointed out in previous responses I can not set linear to 0.
> If I do all of networking breaks. That suggests that kvd/linear is used
> by more networking entities than you are currently reporting. Hence,
> telling me the kvd/linear occupancy is 0 is wrong.
> 
> I believe the stems from the how you are determining occupancy and the
> fact that not all tables have been implemented. Showing the current
> occupancy of a resource is very helpful, so it should be part of the API.
> 
> However, until mlxsw shows a proper value (either by implementing all
> dpipe tables or changing how it is calculated) it should not be
> returning that attribute. As it stands you are giving a user invalid data.
> 

Wait, the occupancy is very precise: it goes directly to the linear
allocator inside the driver and gives the exact current usage. You assumed
it goes to the dpipe tables, which is incorrect.

The linear kvd memory is used by:
1. The adjacency table is responsible for the ECMP and nexthop
   resolution.
2. ACL flexible actions blocks (dpipe will contain table for this).

I don't know what you configured, but if you have some remote
routes with a nexthop, the occ should not be zero.

If your router contains only local routes and doesn't use ACLs, you can
shrink it to zero, no problem.

>>
>>>
>>>
>>>
>>> How does a user learn the granularity of a resource:
>>>
>>> $ devlink resource set pci/:03:00.0 path /kvd/hash_double size 5
>>> Error: mlxsw_spectrum: resource set with wrong granularity.
>>>
>>> Try again with 51000 and then 52000 and ... Why not export the
>>> granularity read-only? I don't see it in the proposed UAPI.
>>>
>>
>> I would like more adding the granularity size to the extack string
>> instead of adding this to UAPI. The user will try to update once
>> and will get the required granularity in the error message.
> 
> A user should not have to run a command to get an error message to know
> essential data to running a command with the right value. Further, do
> not assume 'user' is a person or the devlink command.
> 
> Since granularity is essential to running a valid command, it should be
> an attribute for each resource.
> 

Is this also your approach towards the min/max for resource sizes?
Am I correct?

> 
>>
>>>
>>> And then on the reload:
>>>
>>> $ devlink reload pci/:03:00.0
>>> Error: devlink: resources size validation failed.
>>>
>>> Since the reload is not doing any resource sizing that error message is
>>> confusing. Maybe something like "Sum of the resource components exceeds
>>> total size."
>>>
>>
>> No problem, sounds better.
>>
>>>
>>> Finally, I still contend a 1-line description of each of the resources
>>> goes a long way to improving the user friendliness of this set.
>>>
>>
>> Strongly against it.
>>
> 


Re: [patch net-next v2 00/10] Add support for resource abstraction

2018-01-04 Thread Arkadi Sharshevsky


On 01/04/2018 04:28 AM, David Ahern wrote:
> On 1/3/18 11:05 AM, Arkadi Sharshevsky wrote:
>>
>>
>> On 01/02/2018 08:05 PM, David Ahern wrote:
>>> On 1/1/18 7:58 AM, Arkadi Sharshevsky wrote:
>>>>
>>>> Just to summarize the current fixes required:
>>>>
>>>> 1. ERIF dpipe table size is reporting wrong size. More precisely the
>>>>ERIF table does not take rifs, so it should not be linked to the rif
>>>>bank resource (is not part of this patchset, future extension).
>>>> 2. Extended ACK user-space bug.
>>>> 3. ABI documentation- Not sure we agreed upon it, Jiri?
>>>>
>>>> If I missed something please respond. Nothing of the fixes mentioned
>>>> above is relevant for this patchset actually.
>>>>
>>>
>>> Can you fix the userspace command and then we come back to what else is
>>> needed? Right now, it is hard to tell what is a user space bug and what
>>> is a kernel space bug.
>>>
>>> For example:
>>> $ devlink resource set pci/:03:00.0 path /kvd/linear size 1
>>> $ devlink resource show pci/:03:00.0
>>> pci/:03:00.0:
>>>   name kvd size 245760 size_valid true
>>>   resources:
>>> name linear size 98304 occ 0
>>> name hash_double size 60416
>>> name hash_single size 87040
>>>
>>> The set command did not fail, yet there is no size_new arg in the output
>>> like there is for this change:
>>>
>>> $ devlink resource set pci/:03:00.0 path /kvd/linear size 0
>>> $ devlink resource show pci/:03:00.0
>>> pci/:03:00.0:
>>>   name kvd size 245760 size_valid true
>>>   resources:
>>> name linear size 98304 size_new 0 occ 0
>>> name hash_double size 60416
>>> name hash_single size 87040
>>>
>>
>> As I stated this is a user-space bug which I fixed, and updated my repo
>> so please pull. Devlink uses mnl,and currently mnl does not support
>> extended ack. I added support for this in my local ver of libmnl:
>>
>> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Farkadis%2Flibmnl.git=02%7C01%7Carkadis%40mellanox.com%7C9a369b54cdec48a5e1d208d5531adbdb%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636506297202356822=6KOkBz5PHqu6z8nlexSdggZj42LE4geiYVFA%2BgcgaPE%3D=0
>>
>> On branch master, so you can check it out. Besides this bugs, which were
>> userspace, can please specify what are the pending problems from your
>> point of view? Thanks!
>>
> 
> Again, my comments all stem from user experience.
> 
> Can you explain what "double_word" means for a unit? I would expect a
> units to be kB or count (or items or entries).
>

Double word is 64 bit; I don't understand why this is confusing.

> $ devlink resource show pci/:03:00.0
> pci/:03:00.0:
>   name kvd size 245760 unit double_word size_valid true
>   resources:
> name linear size 98304 occ 0 unit double_word
> name hash_double size 60416 unit double_word
> name hash_single size 87040 unit double_word
> 
> While that is confusing here from the userspace command it goes hand in
> hand with patch 2 and:
> 
> +enum devlink_resource_unit {
> + DEVLINK_RESOURCE_UNIT_DOUBLE_WORD,
> +};
> 
> 
> Also, it seems like the occ of 0 is wrong since we know from past
> responses that if I set linear to 0 all of networking breaks.

You are clearly misunderstanding what the occupancy of the resource is
and what kvd linear is good for. As I mentioned in the last response,
kvd linear is mapped to the adjacency table. So in case it's 0, no nexthop
routes can be configured; this information is provided by the
dpipe<-->resource relation.

Occupancy means how much of the resource is used right now. Why is
this wrong? And how is it related to the size 0 exactly?

> 
> 
> 
> How does a user learn the granularity of a resource:
> 
> $ devlink resource set pci/:03:00.0 path /kvd/hash_double size 5
> Error: mlxsw_spectrum: resource set with wrong granularity.
> 
> Try again with 51000 and then 52000 and ... Why not export the
> granularity read-only? I don't see it in the proposed UAPI.
> 

I would prefer adding the granularity size to the extack string
instead of adding this to the UAPI. The user will try to update once
and will get the required granularity in the error message.

> 
> And then on the reload:
> 
> $ devlink reload pci/:03:00.0
> Error: devlink: resources size validation failed.
> 
> Since the reload is not doing any resource sizing that error message is
> confusing. Maybe something like "Sum of the resource components exceeds
> total size."
> 

No problem, sounds better.

> 
> Finally, I still contend a 1-line description of each of the resources
> goes a long way to improving the user friendliness of this set.
>

Strongly against it.



Re: [patch net-next v2 00/10] Add support for resource abstraction

2018-01-03 Thread Arkadi Sharshevsky


On 01/03/2018 08:29 PM, David Ahern wrote:
> On 1/3/18 11:17 AM, Jiri Pirko wrote:
>> Wed, Jan 03, 2018 at 07:14:16PM CET, d...@cumulusnetworks.com wrote:
>>> On 1/3/18 11:05 AM, Arkadi Sharshevsky wrote:
>>>> As I stated this is a user-space bug which I fixed, and updated my repo
>>>> so please pull. Devlink uses mnl,and currently mnl does not support
>>>> extended ack. I added support for this in my local ver of libmnl:
>>>>
>>>> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Farkadis%2Flibmnl.git=02%7C01%7Carkadis%40mellanox.com%7C5c86b6240eb84459c6ae08d552d7f9a4%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636506009929977440=sgrNzMhPwe63BIVxexZTjl%2FXqW51kpuRiHVhTDNaa70%3D=0
>>>>
>>>> On branch master, so you can check it out. Besides this bugs, which were
>>>> userspace, can please specify what are the pending problems from your
>>>> point of view? Thanks!
>>>
>>> devlink is in iproute2 package and it has extack support. See 'git log
>>> lib/libnetlink.c'
>>
>> Dave, devlink uses libmnl.
>>
> 
> Now I remember. You wrote it independently and but needed iproute2 be a
> delivery vehicle. It uses none of the common infrastructure from
> iproute2. Could we make this more difficult 
> 
> Sometime in the next day I will jump through the hoops to get a proper
> devlink command.
> 

This actually was very confusing. I think the extack should be
handled by libmnl, and iproute should use the mnl_cb_run() routines
rather than implement its own. That way we could both benefit
from that.

You actually do use libmnl in libnetlink.c, but only for parsing
the headers, and it's a dependency for extack handling.

I see this as a completely independent user space issue, which
doesn't have to do anything with this patchset. Not to mention
that everything is working right now.











Re: [patch net-next v2 00/10] Add support for resource abstraction

2018-01-03 Thread Arkadi Sharshevsky


On 01/02/2018 08:05 PM, David Ahern wrote:
> On 1/1/18 7:58 AM, Arkadi Sharshevsky wrote:
>>
>> Just to summarize the current fixes required:
>>
>> 1. ERIF dpipe table size is reporting wrong size. More precisely the
>>ERIF table does not take rifs, so it should not be linked to the rif
>>bank resource (is not part of this patchset, future extension).
>> 2. Extended ACK user-space bug.
>> 3. ABI documentation- Not sure we agreed upon it, Jiri?
>>
>> If I missed something please respond. Nothing of the fixes mentioned
>> above is relevant for this patchset actually.
>>
> 
> Can you fix the userspace command and then we come back to what else is
> needed? Right now, it is hard to tell what is a user space bug and what
> is a kernel space bug.
> 
> For example:
> $ devlink resource set pci/:03:00.0 path /kvd/linear size 1
> $ devlink resource show pci/:03:00.0
> pci/:03:00.0:
>   name kvd size 245760 size_valid true
>   resources:
> name linear size 98304 occ 0
> name hash_double size 60416
> name hash_single size 87040
> 
> The set command did not fail, yet there is no size_new arg in the output
> like there is for this change:
> 
> $ devlink resource set pci/:03:00.0 path /kvd/linear size 0
> $ devlink resource show pci/:03:00.0
> pci/:03:00.0:
>   name kvd size 245760 size_valid true
>   resources:
> name linear size 98304 size_new 0 occ 0
> name hash_double size 60416
> name hash_single size 87040
> 

As I stated, this is a user-space bug which I fixed, and I updated my repo,
so please pull. Devlink uses mnl, and currently mnl does not support
extended ack. I added support for this in my local version of libmnl:

https://github.com/arkadis/libmnl.git

It is on branch master, so you can check it out. Besides these bugs, which were
in userspace, can you please specify what the pending problems are from your
point of view? Thanks!












Re: [patch net-next v2 00/10] Add support for resource abstraction

2018-01-01 Thread Arkadi Sharshevsky


On 12/26/2017 01:23 PM, Jiri Pirko wrote:
> From: Jiri Pirko <j...@mellanox.com>
> 
> Many of the ASIC's internal resources are limited and are shared between
> several hardware procedures. For example, unified hash-based memory can
> be used for many lookup purposes, like FDB and LPM. In many cases the user
> can provide a partitioning scheme for such a resource in order to perform
> fine tuning for his application. In such cases performing driver reload is
> needed for the changes to take place, thus this patchset also adds support
> for hot reload.
> 
> Such an abstraction can be coupled with devlink's dpipe interface, which
> models the ASIC's pipeline as a graph of match/action tables. By modeling
> the hardware resource object, and by coupling it to several dpipe tables,
> further visibility can be achieved in order to debug ASIC-wide issues.
> 
> The proposed interface will provide the user the ability to understand the
> limitations of the hardware, and receive notification regarding its occupancy.
> Furthermore, monitoring the resource occupancy can be done in real-time and
> can be useful in many cases.
> ---
> Userspace part prototype can be found at 
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Farkadis%2Fiproute2%2F=02%7C01%7Carkadis%40mellanox.com%7C1ae3d8b4854a454e21e008d54c5329e3%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636498842440762657=7MC2BFQFxjnmHqy2sOOL9VEa4ZGq6e5Z2n2WvuNgyFk%3D=0
> at resource_dev branch.
> 
> v1->v2
> - Add resource size attribute.
> - Fix split bug.
> 

Just to summarize the current fixes required:

1. ERIF dpipe table size is reporting wrong size. More precisely the
   ERIF table does not take rifs, so it should not be linked to the rif
   bank resource (is not part of this patchset, future extension).
2. Extended ACK user-space bug.
3. ABI documentation- Not sure we agreed upon it, Jiri?

If I missed something please respond. Nothing of the fixes mentioned
above is relevant for this patchset actually.

Couple of key-points:

1. Constraints/trade-offs when setting the sizes - these can be obtained
   trivially from the nested structure of the resource tree.
2. Dpipe provides the mapping of hardware processes to resources.
3. Units - each resource specifies its units; if a dpipe table's size is
   X and it is related to some resource, its size is normalized to that
   resource's basic unit.

IMO this is the most hardware-exact interaction, and this is the way it
should be exported from the kernel. If something is not presented in a
user-convenient way, some utilities can be implemented in userspace
to do it easily. Furthermore, some examples will be provided for the
whole kvd tree partition for different cases (IPv6 heavy etc.).
Advanced users will be able to tweak it as they like.

Regarding the 'switchdev' layer I think that kernel's software tables
like nexthops/neigh/routes should be mapped to dpipe tables and not
to resources directly:

kernel_fdb--> dpipe_fdb -->/kvd/hash_single.

> Arkadi Sharshevsky (10):
>   devlink: Add per devlink instance lock
>   devlink: Add support for resource abstraction
>   devlink: Add support for reload
>   devlink: Add relation between dpipe and resource
>   mlxsw: pci: Add support for performing bus reset
>   mlxsw: spectrum: Register KVD resources with devlink
>   mlxsw: spectrum_dpipe: Connect dpipe tables to resources
>   mlxsw: spectrum: Add support for getting kvdl occupancy
>   mlxsw: pci: Add support for getting resource through devlink
>   mlxsw: core: Add support for reload
> 
>  drivers/net/ethernet/mellanox/mlxsw/core.c |  85 ++-
>  drivers/net/ethernet/mellanox/mlxsw/core.h |  16 +-
>  drivers/net/ethernet/mellanox/mlxsw/i2c.c  |   5 +-
>  drivers/net/ethernet/mellanox/mlxsw/pci.c  |  98 ++--
>  drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 205 
>  drivers/net/ethernet/mellanox/mlxsw/spectrum.h |  13 +
>  .../net/ethernet/mellanox/mlxsw/spectrum_dpipe.c   |  72 ++-
>  .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c|  26 +
>  include/net/devlink.h  |  97 
>  include/uapi/linux/devlink.h   |  21 +
>  net/core/devlink.c | 573 
> ++---
>  11 files changed, 1079 insertions(+), 132 deletions(-)
> 


Re: [patch net-next v2 00/10] Add support for resource abstraction

2018-01-01 Thread Arkadi Sharshevsky


On 12/31/2017 05:46 PM, David Ahern wrote:
> On 12/31/17 3:52 AM, Arkadi Sharshevsky wrote:
>>> [1] This is allowed by the current patch set and perhaps it should not be:
>>>
>>> $ ip ro ls vrf vrf1101
>>> unreachable default metric 8192
>>> 11.2.51.0/24 dev swp1s0.51 proto kernel scope link src 11.2.51.1 offload
>>> 11.3.51.0/24 dev swp1s1.51 proto kernel scope link src 11.3.51.1 offload
>>> 11.4.51.0/24 dev swp1s2.51 proto kernel scope link src 11.4.51.1 offload
>>> 11.5.51.0/24 dev swp1s3.51 proto kernel scope link src 11.5.51.1 offload
>>> 11.6.51.0/24 dev swp3s0.51 proto kernel scope link src 11.6.51.1 offload
>>> 11.7.51.0/24 dev swp3s1.51 proto kernel scope link src 11.7.51.1 offload
>>> 11.8.51.0/24 dev swp3s2.51 proto kernel scope link src 11.8.51.1 offload
>>> 11.9.51.0/24 dev swp3s3.51 proto kernel scope link src 11.9.51.1 offload
>>>
>>> $ devlink resource set pci/:03:00.0 path /kvd/linear size 0
>>
>> This line actually did nothing, because size zero is not acceptable
>> see patch 6. This is pure userpsace problem that error is not shown.
> 
> Then perhaps you have a kernel side bug. After the reload I get this:
> 
> $ devlink resource show pci/:03:00.0
> pci/:03:00.0:
>   name kvd size 245760 size_valid true
>   resources:
> name linear size 0 occ 0
> name hash_double size 60416
> name hash_single size 87040
> 

Actually no bug here, the linear can be zero. This implies no nexthop
routes. The adj table uses it as you can see.

> 
>>
>> You can verify it by dumping the resources and see that there is no
>> pending change (only size and not size_new).
>>
>>> $ devlink reload pci/:03:00.0
>>> $ ip ro ls vrf vrf1101
>>> unreachable default metric 8192
>>>
>>
>> So you just performed full reload of the driver which includes
>> unregistration of all the netdevs and full init. KVD update requires
>> full teardown of the driver.
> 
> you are right, I forgot to do a networking reload. Because of the above
> (0 was actually taken) all kinds of errors are spewed on 'ifreload -av'
> and there is no change to the ro ls:
> 
> $  ip ro ls vrf vrf1101
> unreachable default metric 8192
> 
>>
>> The system will not get back to the same state after reloading,
>> It's should be done on init. But it doesn't have to be like this
>> this, each driver provides his own reload devlink op implementation
>> so in our case full blown reset is required.
>>
>>
>>> [2] Same exact result for setting hash_double to 0:
>>> $ ip ro ls vrf vrf1101
>>> unreachable default metric 8192
>>> 11.2.51.0/24 dev swp1s0.51 proto kernel scope link src 11.2.51.1 offload
>>> 11.3.51.0/24 dev swp1s1.51 proto kernel scope link src 11.3.51.1 offload
>>> 11.4.51.0/24 dev swp1s2.51 proto kernel scope link src 11.4.51.1 offload
>>> 11.5.51.0/24 dev swp1s3.51 proto kernel scope link src 11.5.51.1 offload
>>> 11.6.51.0/24 dev swp3s0.51 proto kernel scope link src 11.6.51.1 offload
>>> 11.7.51.0/24 dev swp3s1.51 proto kernel scope link src 11.7.51.1 offload
>>> 11.8.51.0/24 dev swp3s2.51 proto kernel scope link src 11.8.51.1 offload
>>> 11.9.51.0/24 dev swp3s3.51 proto kernel scope link src 11.9.51.1 offload
>>>
>>> $ devlink resource set pci/:03:00.0 path /kvd/hash_double size 0
> 
> On this command you are correct, 0 is not taken:
> $ devlink resource set pci/:03:00.0 path /kvd/hash_double size 0
> $ devlink resource show pci/:03:00.0
> pci/:03:00.0:
>   name kvd size 245760 size_valid true
>   resources:
> name linear size 0 occ 0
> name hash_double size 60416
> name hash_single size 87040
> 
> but the 'set' command did not fail with a proper extack based error
> message, so consider this another bug report.
> 

Yeah, this is a bug, but a userspace one. I actually sniffed the nl
messages with nlmon and saw the extended ack packet with the required
string.


Re: [patch net-next v2 00/10] Add support for resource abstraction

2017-12-31 Thread Arkadi Sharshevsky


On 12/30/2017 11:15 PM, David Ahern wrote:
> On 12/28/17 1:21 AM, Yuval Mintz wrote:
>> I think it goes the other way around. The dpipe tables are the ones that
>> can be translated to functionality; The resources are internal and 
>> HW-specific
>> representing the possible internal division of resources -
>> but a given resource isn't necessarily mapped to a single networking feature.
>> [It might be in some cases, but not in the general case]
> 
> This is what I am getting at -- a single resource /kvd/linear is used
> for multiple networking features, and those networking features do map
> to well known entities -- fdb entries, ACL entries, ipv4/v6 host
> entries, LPM entries, etc.
> 
> Nothing about the output from devlink helps the user in any way to
> understand how to change the resource values. Saying that these

The current patchset adds the following dpipe table <--> resource
relation

host4 -- hash single
host6 -- hash double
adj -- linear

By dumping the resources via 'resource show' you can see the tree-like
structure, and you can see that you have a tradeoff between those subparts.
So for example if a user would like to increase the number of nexthops
at the expense of neighbors, it is pretty clear. As more dpipe tables
are introduced these relations will become more complete and the user
will get the complete view of the ASIC.

Just to summarize, the user gets the following info
1. Constraints/trade-offs about setting the sizes - this you get
   by the tree structure.
2. Each hardware process which uses this resource is mapped to it

By combining those two you can get the most accurate information
about what your change will do. Partitioning of the KVD is a very delicate
process, because the hardware is complex. Many hardware processes are
pointing to this memory and size changes affect the whole ASIC; as I
mentioned, as more of the pipeline is exposed via dpipe the user
will get a more precise view of the hardware.

We will provide some recommended and tested configuration of the whole
mlxsw resource tree for different user scenarios. A more experienced
user can do it for himself, if he got some very special scenario.


> resources, what they mean and how they are used is MLX proprietary and
> is known only to MLX employees and those with MLX agreements is not
> acceptable. Likewise, requiring some network admin to deep dive into the
> mlxsw driver to piece together how kvd/linear (for example) is used is
> not acceptable.
> 
> The cover letter touts "Many of the ASIC's internal resources are
> limited and are shared between several hardware procedures. For example,
> unified hash-based memory can be used for many lookup purposes, like FDB
> and LPM. In many cases the user can provide a partitioning scheme for
> such a resource in order to perform fine tuning for his application."
> 
> Great, now give the user some indication of how to do that. Is setting
> /kvd/linear to 0 acceptable? If not, why? What functionality is lost?
> (Apparently, everything [1].)
>
> The dpipe tables list some correlation between the kvd resources and
> tables but that is not a complete list and again there is nothing to
> tell a user that it is only a partial list of how a kvd resource is

This is work in progress, the LPM block will be exposed as the last
L3 part. Then we will start the l2 part of the ASIC.

> used. For example, it shows ipv4 host is in /kvd/hash_single and that is
> all it shows. So if I have an ipv6 only deployment can I conclude that I
> can set /kvd/hash_single to 0? Or the reverse, can I set hash_double to
> 0 for an ipv4 only deployment? From the limited information given, it is
> reasonable for a user to assume yes and has to learn through trial and
> error what can be done. [2]
> 

So you want to add a min/max size attribute? I think this is not needed.

> -
> 
> [1] This is allowed by the current patch set and perhaps it should not be:
> 
> $ ip ro ls vrf vrf1101
> unreachable default metric 8192
> 11.2.51.0/24 dev swp1s0.51 proto kernel scope link src 11.2.51.1 offload
> 11.3.51.0/24 dev swp1s1.51 proto kernel scope link src 11.3.51.1 offload
> 11.4.51.0/24 dev swp1s2.51 proto kernel scope link src 11.4.51.1 offload
> 11.5.51.0/24 dev swp1s3.51 proto kernel scope link src 11.5.51.1 offload
> 11.6.51.0/24 dev swp3s0.51 proto kernel scope link src 11.6.51.1 offload
> 11.7.51.0/24 dev swp3s1.51 proto kernel scope link src 11.7.51.1 offload
> 11.8.51.0/24 dev swp3s2.51 proto kernel scope link src 11.8.51.1 offload
> 11.9.51.0/24 dev swp3s3.51 proto kernel scope link src 11.9.51.1 offload
> 
> $ devlink resource set pci/:03:00.0 path /kvd/linear size 0

This line actually did nothing, because size zero is not acceptable;
see patch 6. It is purely a userspace problem that the error is not shown.

You can verify it by dumping the resources and see that there is no
pending change (only size and not size_new).

> $ devlink reload pci/:03:00.0
> $ ip ro ls vrf vrf1101
> unreachable default 

Re: [patch net-next v2 00/10] Add support for resource abstraction

2017-12-29 Thread Arkadi Sharshevsky


On 12/28/2017 06:33 PM, David Ahern wrote:
> On 12/28/17 10:23 AM, Jiri Pirko wrote:
>>> So there are 4 tables exported to userspace:
>>> 
>>> 1. mlxsw_erif table which is not in any of the kvd regions (no
>>> resource path is given) and it has a size of 1000. Does
>>> mlxsw_erif mean a rif as in Router Interfaces? So the switch
>>> supports up to 1000 router interfaces.
>>> 
>>> 2. mlxsw_host4 in /kvd/hash_single with a size of 62. Based on
>>> the
>> Size tells you the actual size. It cannot give you max size. The
>> reason is simple. The resources are shared among multiple tables.
>> That is exactly what this resource patch makes visible.
>> 
>> 
> 
> In the erif table, the 1000 is the max not current usage. I do not
> have 1000 interfaces:
> 
> $ ip -br li sh | wc -l 597
> 
> 
> $ devlink dpipe table dump pci/:03:00.0 name mlxsw_erif ... index
> 503 match_value: type field_exact header mlxsw_meta field erif_port
> mapping ifindex mapping_value 601 value 503 action_value: type
> field_modify header mlxsw_meta field l3_forward value 1
> 
> 
> The host4 table it is current size with no maximum.
> 
> The meaning of table size needs to be consistent across tables.
> 

You are right the egress RIF table size is not correct, I will
definitely fix it, but it is not what you think it should be. So in
order to clarify this point, just a reminder:

1. Both dpipe and devlink resource are abstraction models for
hardware entities, and as a result they try to provide generic objects.
Each driver/ASIC should register its own, and it is an absolutely proprietary
implementation. There is absolutely NO industry standard here, the only
thing that resembles a standard is that dpipe looks a bit like P4, only
because it has proved to be useful for describing packet forwarding
pipelines. The host4 table is just a hardware process in the Mellanox
Spectrum ASIC pipeline and it should not be part of ABI, sorry I clearly
don't understand how this even came up.

2. Dpipe table is a single hardware process, most of the time it uses
some resources (for example LPM algorithm uses hash memory).

3. ERIF table is a table that is located at the end of the L3 pipeline.
The current dpipe description is not complete and that is why it caused
confusion. The table performs a match on rif index and packet type
(UC/MC/BC) and performs a forward/drop decision. As you can see, for each
rif the table can have several entries, which provide different
statistics for different traffic types per rif; currently only the UC
is exposed with forward.

4. ASICs use shared resource for many processes, this is exactly the
behavior we want to expose!

Again, the size of the ERIF table should NOT provide the number of
rifs which are in use, simply because dpipe tables do not describe
hardware resources.

In the future the RIF bank will be exported as a resource object with a size
of 1000, and in order to observe how many are in use you should check
its occupancy. This is the whole reason for this interface.



Re: [patch net-next v2 00/10] Add support for resource abstraction

2017-12-27 Thread Arkadi Sharshevsky


On 12/27/2017 06:34 PM, David Ahern wrote:
> On 12/27/17 2:09 AM, Jiri Pirko wrote:
>> Wed, Dec 27, 2017 at 05:05:09AM CET, d...@cumulusnetworks.com wrote:
>>> On 12/26/17 5:23 AM, Jiri Pirko wrote:
 From: Jiri Pirko 

 Many of the ASIC's internal resources are limited and are shared between
 several hardware procedures. For example, unified hash-based memory can
 be used for many lookup purposes, like FDB and LPM. In many cases the user
 can provide a partitioning scheme for such a resource in order to perform
 fine tuning for his application. In such cases performing driver reload is
 needed for the changes to take place, thus this patchset also adds support
 for hot reload.

 Such an abstraction can be coupled with devlink's dpipe interface, which
 models the ASIC's pipeline as a graph of match/action tables. By modeling
 the hardware resource object, and by coupling it to several dpipe tables,
 further visibility can be achieved in order to debug ASIC-wide issues.

 The proposed interface will provide the user the ability to understand the
 limitations of the hardware, and receive notification regarding its 
 occupancy.
 Furthermore, monitoring the resource occupancy can be done in real-time and
 can be useful in many cases.
>>>
>>> In the last RFC (not v1, but RFC) I asked for some kind of description
>>> for each resource, and you and Arkadi have pushed back. Let's walk
>>> through an example to see what I mean:
>>>
>>> $ devlink resource show pci/:03:00.0
>>> pci/:03:00.0:
>>>  name kvd size 245760 size_valid true
>>>  resources:
>>>name linear size 98304 occ 0
>>>name hash_double size 60416
>>>name hash_single size 87040
>>>
>>> So this 2700 has 3 resources that can be managed -- some table or
>>> resource or something named 'kvd' with linear, hash_double and
>>> hash_single sub-resources. What are these names referring to? The above
>>> output gives no description, and 'kvd' is not an industry term. Further,
>>
>> This are internal resources specific to the ASIC. Would you like some
>> description to each or something like that?
> 
> devlink has some nice self-documenting capabilities. What's missing here
> is a description of what the resource is used for in standard terms --
> ipv4 host routes, fdb, nexthops, rifs, etc. Even if the description is a
> short list versus an exhaustive list of everything it is used for. e.g.,
> Why would a user decrease linear and increase hash_single or vice versa?
> 
>>
>>
>>> what are these sizes that a user can control? The output contains no
>>> units, no description, nothing. In short, the above output provides
>>> random numbers associated with random names.
>>
>> Units are now exposed from kernel, just this version of iproute2 patch
>> does not display it.
> 
> please provide an iproute2 patch that does so the full context if this
> patch set can be reviewed from a user perspective.
> 
>>
>>
>>>
>>> I can see dpipe tables exported by this device:
>>>
>>> $ devlink dpipe header show pci/:03:00.0
>>>
>>> pci/:03:00.0:
>>>  name mlxsw_meta
>>>  field:
>>>name erif_port bitwidth 32 mapping_type ifindex
>>>name l3_forward bitwidth 1
>>>name l3_drop bitwidth 1
>>>name adj_index bitwidth 32
>>>name adj_size bitwidth 32
>>>name adj_hash_index bitwidth 32
>>>
>>>  name ipv6
>>>  field:
>>>name destination ip bitwidth 128
>>>
>>>  name ipv4
>>>  field:
>>>name destination ip bitwidth 32
>>>
>>>  name ethernet
>>>  field:
>>>name destination mac bitwidth 48
>>>
>>> but none mention 'kvd' or 'linear' or 'hash' and none of the other
>>> various devlink options:
>>>
>>> $ devlink
>>> Usage: devlink [ OPTIONS ] OBJECT { COMMAND | help }
>>> where  OBJECT := { dev | port | sb | monitor | dpipe }
>>>
>>> seem to be related to resources.
>>>
>>> So how does a user know what they are controlling by this 'resource'
>>> option? Is the user expected to have a PRM or user guide on hand for the
>>> specific device model that is being configured?
>>
>> The relation of specific dpipe table to specific resource is exposed by
>> the kernel as well. Probably the iproute2 patch just does not display
>> it.
> 
> please provide an iproute2 patch that does so the full context if this
> patch set can be reviewed from a user perspective.
>

As Yuval stated you are using the wrong command here.
You are printing the headers not the tables. On each dpipe
table you can see the resource  it is using (the resource
path aka host table uses /kvd/hash_single for example).

This is already working. Just try it.

>>
>>
>>>
>>> Again, I have no objections to kvd, linear, hash, etc terms as they do
>>> relate to Mellanox products. But kvd/linear, for example, does correlate
>>> to industry standard concepts in some way. My request is that the
>>> resource listing guide the user in some way, stating what these
>>> resources mean.
>>
>> So the 

Re: [patch net-next 02/10] devlink: Add support for resource abstraction

2017-12-25 Thread Arkadi Sharshevsky


On 12/20/2017 09:43 PM, David Miller wrote:
> From: Jiri Pirko <j...@resnulli.us>
> Date: Wed, 20 Dec 2017 12:58:13 +0100
> 
>> From: Arkadi Sharshevsky <arka...@mellanox.com>
>>
>> Add support for hardware resource abstraction over devlink. Each resource
>> is identified via id, furthermore it contains information regarding its
>> size and its related sub resources. Each resource can also provide its
>> current occupancy.
>>
>> In some cases the sizes of some resources can be changed, yet for those
>> changes to take place a hot driver reload may be needed. The reload
>> capability will be introduced in the next patch.
>>
>> Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
>> Signed-off-by: Jiri Pirko <j...@mellanox.com>
> 
> In what units are these sizes?  If it depends upon the resource, it would
> be great to have a way to introspect the units given a resource.
> 

This is problematic. Currently the units are actually double words
(a single entry is 64 bits) because this resource is actually a memory.
So my first thought was adding an enum in UAPI of resource_units

enum resource_units {
DEVLINK_RESOURCE_UNITS_WORD,
DEVLINK_RESOURCE_UNITS_DOUBLE_WORD,
DEVLINK_RESOURCE_UNITS_ITEM, /* this is in order to define some
driver specific stuff*/
...
};

But the 'item' is too vague, because for example, we will have the
RIF bank as resource. What unit will it have? rifs? items?

Any inputs on this?


>> +struct devlink_resource_ops *resource_ops;
> 
> Const?
> 
>> +static inline int
>> +devlink_resource_register(struct devlink *devlink,
>> +  const char *resource_name,
>> +  bool top_hierarchy,
>> +  u64 resource_size,
>> +  u64 resource_id,
>> +  u64 parent_resource_id,
>> +  struct devlink_resource_ops *resource_ops)
> 
> Const for resource_ops?
> 
>> +int devlink_resource_register(struct devlink *devlink,
>> +  const char *resource_name,
>> +  bool top_hierarchy,
>> +  u64 resource_size,
>> +  u64 resource_id,
>> +  u64 parent_resource_id,
>> +  struct devlink_resource_ops *resource_ops)
> 
> Likewise.
> 


Re: [patch net-next RFC v2 08/11] mlxsw: spectrum_dpipe: Connect dpipe tables to resources

2017-11-28 Thread Arkadi Sharshevsky


On 11/27/2017 06:12 PM, David Ahern wrote:
> On 11/23/17 6:40 AM, Arkadi Sharshevsky wrote:
>>
>>
>> On 11/19/2017 05:58 PM, David Ahern wrote:
>>> On 11/19/17 2:16 AM, Arkadi Sharshevsky wrote:
>>>>
>>>>
>>>> On 11/18/2017 09:19 PM, David Ahern wrote:
>>>>> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>>>>>> From: Arkadi Sharshevsky <arka...@mellanox.com>
>>>>>>
>>>>>> Connect current dpipe tables to resources. The tables are connected
>>>>>> in the following fashion:
>>>>>> 1. IPv4 host - KVD hash single
>>>>>> 2. IPv6 host - KVD hash double
>>>>>> 3. Adjacency - KVD linear
>>>>>
>>>>> Those descriptions would be helpful to the user. A description attribute
>>>>> for the resources?
>>>>>
>>>>
>>>> As described in the cover letter these resources are used by the
>>>> majority of the ASIC's lookup processes. So currently there is a one
>>>> to one mapping but it should increase as more tables are exposed,
>>>> so I don't think it's a good idea to maintain such an attribute.
>>>>
>>>
>>> 'IPv4 host' yes, but I mean the term 'KVD hash single'? Is it the same
>>> across all h/w vendors? I have only seen that in the context of MLX. If
>>> it is a MLX term then a description to the user that KVD hash single ==
>>> IPv4 host is warranted.
>>>
>>
>> But this relation is wrong, there is no equality here. The LPM, FDB and
>> VID to FID mapping can all be modeled as lookup tables (via dpipe)
>> that use the KVD hash single resource.
>>
>> This description string will grow very long. I don't think this is the
>> right place to document such a thing; either way, the user can dump the
>> dpipe tables and see which is mapped to what resource.
> 
> Users should not have to find a PRM or user guide for *each version of
> their hardware* to program something so fundamental. This is software.

I still don't understand, you can dump the dpipe table to see which
tables use which resource. Why do I need this redundant documentation string
in the kernel?

> We can make it user friendly. Use of vendor specific terms is fine --
> allows correlation to vendor docs. But there should also be text to help

IMHO such documentation strings should not be in the kernel.

> the user correlate vendor terms to generic industry terms.
> 

Do you really want to add a DEVLINK_RESOURCE_DESCRIPTION string attribute?
"
Used by the following hardware tables
- VID-to-FID
- LPM
- FDB
- HOST
...
"

Again I think it is redundant, just dump those tables. No need to use
user-guides nor PRM.


Re: [patch net-next RFC v2 08/11] mlxsw: spectrum_dpipe: Connect dpipe tables to resources

2017-11-23 Thread Arkadi Sharshevsky


On 11/19/2017 05:58 PM, David Ahern wrote:
> On 11/19/17 2:16 AM, Arkadi Sharshevsky wrote:
>>
>>
>> On 11/18/2017 09:19 PM, David Ahern wrote:
>>> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>>>> From: Arkadi Sharshevsky <arka...@mellanox.com>
>>>>
>>>> Connect current dpipe tables to resources. The tables are connected
>>>> in the following fashion:
>>>> 1. IPv4 host - KVD hash single
>>>> 2. IPv6 host - KVD hash double
>>>> 3. Adjacency - KVD linear
>>>
>>> Those descriptions would be helpful to the user. A description attribute
>>> for the resources?
>>>
>>
>> As described in the cover letter these resources are used by the
>> majority of the ASIC's lookup processes. So currently there is a one
>> to one mapping but it should increase as more tables are exposed,
>> so I don't think it's a good idea to maintain such an attribute.
>>
> 
> 'IPv4 host' yes, but I mean the term 'KVD hash single'? Is it the same
> across all h/w vendors? I have only seen that in the context of MLX. If
> it is a MLX term then a description to the user that KVD hash single ==
> IPv4 host is warranted.
> 

But this relation is wrong, there is no equality here. The LPM, FDB and
VID to FID mapping can all be modeled as lookup tables (via dpipe)
that use the KVD hash single resource.

This description string will grow very long. I don't think this is the
right place to document such a thing; either way, the user can dump the
dpipe tables and see which is mapped to what resource.






Re: [patch net-next RFC v2 02/11] devlink: Add support for resource abstraction

2017-11-23 Thread Arkadi Sharshevsky
[...]
 +
 +  resource = devlink_resource_find(devlink, NULL, resource_id);
 +  if (!resource)
 +  return -EINVAL;
 +
 +  if (!resource->resource_ops->size_validate)
 +  return -EINVAL;
>>>
>>> genl_info has extack; please add user messages for the above failures.
>>>
>>
>> Isn't EOPNOTSUPP enough ?
> 
> No, I mean every failure above returns EINVAL. Add an extack message
> telling the user what is wrong. e.g,
> 
>   resource = devlink_resource_find(devlink, NULL, resource_id);
>   if (!resource) {
>   NL_SET_ERR_MSG(extack, "Invalid resource id");
>   return -EINVAL;
>   }
> 
> similarly for the rest.
> 

I don't actually understand why; extended ack should be used when a
typical errno is not enough, for example something driver specific
like "KVD overlapping is not supported".

But here, if the user provided an id for setting the resource, I think
EINVAL is enough if devlink cannot find it.



Re: [patch net-next RFC v2 09/11] mlxsw: spectrum: Add support for getting kvdl occupancy

2017-11-19 Thread Arkadi Sharshevsky


On 11/18/2017 09:21 PM, David Ahern wrote:
> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>> From: Arkadi Sharshevsky <arka...@mellanox.com>
>>
>> Add support for getting the kvdl occupancy through the resource interface.
>>
> 
> Do you intend to add occ_get for the other kvd partitions?
> 

Yes of course, it's a separate patchset due to its complexity.


Re: [patch net-next RFC v2 08/11] mlxsw: spectrum_dpipe: Connect dpipe tables to resources

2017-11-19 Thread Arkadi Sharshevsky


On 11/18/2017 09:19 PM, David Ahern wrote:
> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>> From: Arkadi Sharshevsky <arka...@mellanox.com>
>>
>> Connect current dpipe tables to resources. The tables are connected
>> in the following fashion:
>> 1. IPv4 host - KVD hash single
>> 2. IPv6 host - KVD hash double
>> 3. Adjacency - KVD linear
> 
> Those descriptions would be helpful to the user. A description attribute
> for the resources?
> 

As described in the cover letter these resources are used by the
majority of the ASIC's lookup processes. So currently there is a one
to one mapping but it should increase as more tables are exposed,
so I don't think it's a good idea to maintain such an attribute.



Re: [patch net-next RFC v2 07/11] mlxsw: spectrum: Register KVD resources with devlink

2017-11-19 Thread Arkadi Sharshevsky


On 11/18/2017 09:18 PM, David Ahern wrote:
> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
>> b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>> index d02c130..f0cbd67 100644
>> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>> @@ -3927,6 +3927,173 @@ static const struct mlxsw_config_profile 
>> mlxsw_sp_config_profile = {
>>  .resource_query_enable  = 1,
>>  };
>>  
>> +static bool
>> +mlxsw_sp_resource_kvd_granularity_validate(struct netlink_ext_ack *extack,
>> +   u64 size)
>> +{
>> +const struct mlxsw_config_profile *profile;
>> +
>> +profile = _sp_config_profile;
>> +if (size % profile->kvd_hash_granularity) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "resource set with wrong 
>> granularity");
>> +return false;
>> +}
>> +return true;
>> +}
>> +
>> +static int
>> +mlxsw_sp_resource_kvd_size_validate(struct devlink *devlink, u64 size,
>> +struct list_head *resource_list,
>> +struct netlink_ext_ack *extack)
>> +{
>> +struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> +u32 kvd_size, single_size, double_size, linear_size;
>> +struct devlink_resource *resource;
>> +
>> +kvd_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE);
>> +if (kvd_size != size) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "kvd size cannot be 
>> chagned");
> 
> s/chagned/changed/
> 
>> +return -EINVAL;
>> +}
>> +
>> +list_for_each_entry(resource, resource_list, list) {
>> +switch (resource->id) {
>> +case MLXSW_SP_RESOURCE_KVD_LINEAR:
>> +linear_size = resource->size_new;
>> +break;
>> +case MLXSW_SP_RESOURCE_KVD_HASH_SINGLE:
>> +single_size = resource->size_new;
>> +break;
>> +case MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE:
>> +double_size = resource->size_new;
>> +break;
>> +}
>> +}
>> +
>> +/* Overlap is not supported */
>> +if (linear_size + single_size + double_size > kvd_size) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "Overlap is not 
>> supported");
> 
> Overlap? Isn't that sum of the partitions are greater than total size?
> 

In case the sum of the partitions is greater than the total kvd size, the
hash single/double will be set in an overlapping state, which we do
not support currently.

> 
>> +return -EINVAL;
>> +}
>> +
>> +return 0;
>> +}
>> +
>> +static int
>> +mlxsw_sp_resource_kvd_linear_size_validate(struct devlink *devlink, u64 
>> size,
>> +   struct list_head *resource_list,
>> +   struct netlink_ext_ack *extack)
>> +{
>> +if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
>> +return -EINVAL;
>> +
>> +return 0;
>> +}
>> +
>> +static int
>> +mlxsw_sp_resource_kvd_hash_single_size_validate(struct devlink *devlink, 
>> u64 size,
>> +struct list_head *resource_list,
>> +struct netlink_ext_ack *extack)
>> +{
>> +struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> +
>> +if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
>> +return -EINVAL;
>> +
>> +if (size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE)) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "hash single size is 
>> smaller then min");
> 
> s/then min/than minimium/
> 
>> +return -EINVAL;
>> +}
>> +return 0;
>> +}
>> +
>> +static int
>> +mlxsw_sp_resource_kvd_hash_double_size_validate(struct devlink *devlink, 
>> u64 size,
>> +struct list_head *resource_list,
>> +struct netlink_ext_ack *extack)
>> +{
>> +struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> +
>> +if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
>> +return -EINVAL;
>> +
>> +if (size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE)) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "hash double size is 
>> smaller then min");
> 
> s/then min/than minimium/
> 
> How does the user learn the minimum size and the granularity for the KVD
> resources? Seems like those could be read-only attributes in the
> resource dump to make it easier for the user.
> 

This seems to me too case-specific and I didn't want to add
UAPI attributes for this stuff.

The resource shouldn't be defined as only memory-based hardware blocks.
I actually plan to expose the rifs as a resource as well.

I think that if the user tries to configure and receives such an error
it is very clear 

Re: [patch net-next RFC v2 02/11] devlink: Add support for resource abstraction

2017-11-19 Thread Arkadi Sharshevsky


On 11/18/2017 08:34 PM, David Ahern wrote:
> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>> diff --git a/include/net/devlink.h b/include/net/devlink.h
>> index 4d2c6fc..960e80a 100644
>> --- a/include/net/devlink.h
>> +++ b/include/net/devlink.h
> ...
> 
>> @@ -469,6 +523,32 @@ devlink_dpipe_match_put(struct sk_buff *skb,
>>  return 0;
>>  }
>>  
>> +static inline int
>> +devlink_resource_register(struct devlink *devlink,
>> +  const char *resource_name,
>> +  bool top_hierarchy,
>> +  bool reload_required,
>> +  u64 resource_size,
>> +  u64 resource_id,
>> +  u64 parent_resource_id,
>> +  struct devlink_resource_ops *resource_ops)
>> +{
>> +return 0;
>> +}
>> +
>> +static inline void
>> +devlink_resources_unregister(struct devlink *devlink,
>> + struct devlink_resource *resource)
>> +{
>> +}
>> +
>> +static inline int
>> +devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
>> +  u64 *p_resource_size)
>> +{
>> +return -EINVAL;
> 
> It's compiled out so -EOPNOTSUPP seems more appropriate.
> 

will fix

> 
> 
>> diff --git a/net/core/devlink.c b/net/core/devlink.c
>> index 0114dfc..6ae644f 100644
>> --- a/net/core/devlink.c
>> +++ b/net/core/devlink.c
>> +static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
>> +   struct genl_info *info)
>> +{
>> +struct devlink *devlink = info->user_ptr[0];
>> +struct devlink_resource *resource;
>> +u64 resource_id;
>> +u64 size;
>> +int err;
>> +
>> +if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
>> +!info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
>> +return -EINVAL;
> 
> several of the DEVLINK_ATTR_RESOURCE attributes are kernel to
> user only (e.g., DEVLINK_ATTR_RESOURCE_SIZE_NEW and
> DEVLINK_ATTR_RESOURCE_RELOAD_REQUIRED), so if they are given by the user
> that should be an error too right?
> 

Not sure I understood. As you see I only check for the mandatory
attributes; if the user provides irrelevant data it is ignored.

We use one single nla_policy for all the commands (devlink_nl_policy)

> 
>> +resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
> 
> I don't see where these attributes are validated for proper size.
> 

right, forgot to update the policy.

>> +
>> +resource = devlink_resource_find(devlink, NULL, resource_id);
>> +if (!resource)
>> +return -EINVAL;
>> +
>> +if (!resource->resource_ops->size_validate)
>> +return -EINVAL;
> 
> genl_info has extack; please add user messages for the above failures.
> 

Isn't EOPNOTSUPP enough ?

> 
>> +
>> +size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
>> +err = resource->resource_ops->size_validate(devlink, size,
>> +>resource_list,
>> +info->extack);
>> +if (err)
>> +return err;
>> +
>> +resource->size_new = size;
>> +return 0;
>> +}
>> +
> 


Re: [patch net-next RFC v2 00/11] Add support for resource abstraction

2017-11-18 Thread Arkadi Sharshevsky


On 11/17/2017 09:58 PM, David Ahern wrote:
> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>> From: Jiri Pirko 
>>
>> Arkadi says:
>>
>> Many of the ASIC's internal resources are limited and are shared between
>> several hardware procedures. For example, unified hash-based memory can
>> be used for many lookup purposes, like FDB and LPM. In many cases the user
>> can provide a partitioning scheme for such a resource in order to perform
>> fine tuning for his application. In many cases after setting the
>> partitioning of the resource driver reload is needed. This patchset adds
>> support for hot reset of the driver.
>>
>> Such an abstraction can be coupled with devlink's dpipe interface, which
>> models the ASIC's pipeline as a graph of match/action tables. By modeling
>> the hardware resource object, and by coupling it to several dpipe tables,
>> further visibility can be achieved in order to debug ASIC-wide issues.
>>
>> The proposed interface will provide the user the ability to understand the
>> limitations of the hardware, and receive notification regarding its 
>> occupancy.
>> Furthermore, monitoring the resource occupancy can be done in real-time and
>> can be useful in many cases.
>>
>> Userspace part prototype can be found at 
>> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Farkadis%2Fiproute2%2F=02%7C01%7Carkadis%40mellanox.com%7Cbf79bc51b9c641e1c3cc08d52df59f29%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636465455333867183=JHpevZLdZJH2Imk%2FLpEaEbRTlAGMYP6GYaxTsNWHaig%3D=0
>> at resource_dev branch.
>>
> 
> now that my firmware problem is fixed, I installed a build with this
> patch set. Trying to run devlink to split a port hangs:
> 
> $ devlink port split swp1 count 4
> 
> 
> [  615.373359] INFO: task devlink:804 blocked for more than 120 seconds.
> [  615.379934]   Tainted: GW   4.14.0+ #38
> [  615.385238] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
> disables this message.
> [  615.393111] devlink D0   804771 0x0080
> [  615.393115] Call Trace:
> [  615.393126]  __schedule+0x1de/0x690
> [  615.393130]  schedule+0x36/0x80
> [  615.393139]  schedule_preempt_disabled+0xe/0x10
> [  615.393146]  __mutex_lock.isra.4+0x211/0x530
> [  615.393152]  __mutex_lock_slowpath+0x13/0x20
> [  615.393155]  ? __mutex_lock_slowpath+0x13/0x20
> [  615.393158]  mutex_lock+0x2f/0x40
> [  615.393164]  devlink_port_unregister+0x29/0x60 [devlink]
> [  615.393169]  mlxsw_core_port_fini+0x25/0x50 [mlxsw_core]
> [  615.393179]  mlxsw_sp_port_remove+0xf0/0x100 [mlxsw_spectrum]
> [  615.393186]  mlxsw_sp_port_split+0xdc/0x260 [mlxsw_spectrum]
> [  615.393193]  ? _cond_resched+0x19/0x30
> [  615.393200]  mlxsw_devlink_port_split+0x36/0x50 [mlxsw_core]
> [  615.393206]  devlink_nl_cmd_port_split_doit+0x42/0x50 [devlink]
> [  615.393212]  genl_family_rcv_msg+0x1c9/0x390
> [  615.393217]  genl_rcv_msg+0x4c/0xa0
> [  615.393220]  ? _cond_resched+0x19/0x30
> [  615.393228]  ? genl_family_rcv_msg+0x390/0x390
> [  615.393232]  netlink_rcv_skb+0xec/0x120
> [  615.393235]  genl_rcv+0x28/0x40
> [  615.393239]  netlink_unicast+0x170/0x230
> [  615.393244]  netlink_sendmsg+0x28e/0x370
> [  615.393251]  SYSC_sendto+0x10e/0x1b0
> [  615.393258]  ? __audit_syscall_entry+0xc1/0x110
> [  615.393261]  ? syscall_trace_enter+0x1c6/0x2d0
> [  615.393264]  ? __do_page_fault+0x231/0x4b0
> [  615.393268]  SyS_sendto+0xe/0x10
> [  615.393272]  do_syscall_64+0x60/0x1f0
> [  615.393277]  entry_SYSCALL64_slow_path+0x25/0x25
> [  615.393280] RIP: 0033:0x7f4ef43c16f3
> [  615.393284] RSP: 002b:7fffb907fbc8 EFLAGS: 0246 ORIG_RAX:
> 002c
> [  615.393287] RAX: ffda RBX: 013660e0 RCX:
> 7f4ef43c16f3
> [  615.393290] RDX: 0040 RSI: 01366110 RDI:
> 0003
> [  615.393291] RBP:  R08: 7f4ef4686d80 R09:
> 000c
> [  615.393292] R10:  R11: 0246 R12:
> 
> [  615.393296] R13: 0004 R14:  R15:
> 
> 

Thanks, will fix


Re: [patch net-next RFC v2 03/11] devlink: Add support for reload

2017-11-15 Thread Arkadi Sharshevsky


On 11/15/2017 10:03 AM, Jakub Kicinski wrote:
> On Tue, 14 Nov 2017 17:18:44 +0100, Jiri Pirko wrote:
>> +static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info 
>> *info)
>> +{
>> +struct devlink *devlink = info->user_ptr[0];
>> +int err;
>> +
>> +if (!devlink->ops->reload)
>> +return -EOPNOTSUPP;
>> +
>> +err = devlink_resources_validate(devlink, NULL, info);
>> +if (err)
>> +return err;
>> +
>> +mutex_unlock(>lock);
>> +err = devlink->ops->reload(devlink);
>> +mutex_lock(>lock);
>> +
>> +return err;
>> +}
> 
> I'm a bit confused with the locking, why is devlink->lock not held
> around the validation?
> 

As Jiri mentioned, it is held. The per-devlink-instance lock is taken
by default for each doit operation in pre_doit(), because the operation
acts on a specific devlink instance.

The lock is released before performing the reload itself because, during
the reload, the driver registers/unregisters devlink objects such as
sb/dpipe/ports, which take the lock again; releasing it first avoids
recursive locking.


Re: [patch net-next RFC v2 02/11] devlink: Add support for resource abstraction

2017-11-15 Thread Arkadi Sharshevsky


On 11/15/2017 09:59 AM, Jakub Kicinski wrote:
> On Tue, 14 Nov 2017 17:18:43 +0100, Jiri Pirko wrote:
>> From: Arkadi Sharshevsky <arka...@mellanox.com>
>>
>> Add support for hardware resource abstraction over devlink. Each resource
>> is identified via id, furthermore it contains information regarding its
>> size and its related sub resources. Each resource can also provide its
>> current occupancy.
>>
>> In some cases the sizes of some resources can be changed, yet for those
>> changes to take place a hot driver reload may be needed. The reload
>> capability will be introduced in the next patch.
>>
>> Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
>> Signed-off-by: Jiri Pirko <j...@mellanox.com>
>> ---
>>  include/net/devlink.h|  80 +++
>>  include/uapi/linux/devlink.h |  10 ++
>>  net/core/devlink.c   | 330 
>> +++
>>  3 files changed, 420 insertions(+)
>>
>> diff --git a/include/net/devlink.h b/include/net/devlink.h
>> index 4d2c6fc..960e80a 100644
>> --- a/include/net/devlink.h
>> +++ b/include/net/devlink.h
>> @@ -224,6 +225,45 @@ struct devlink_dpipe_headers {
>>  unsigned int headers_count;
>>  };
>>  
>> +/**
>> + * struct devlink_resource_ops - resource ops
>> + * @occ_get - get the occupied size
>> + * @size_validate - validate the size of the resource before update, reload
> 
> nit:
> @member: is more common and used throughout this file, rather than
> @member - 
> 

Will fix, thanks for the review.

>> + *  is needed for changes to take place
>> + */
>> +struct devlink_resource_ops {
>> +u64 (*occ_get)(struct devlink *devlink);
>> +int (*size_validate)(struct devlink *devlink, u64 size,
>> + struct list_head *resource_list,
>> + struct netlink_ext_ack *extack);
>> +};


Re: [patch net-next RFC v2 07/11] mlxsw: spectrum: Register KVD resources with devlink

2017-11-15 Thread Arkadi Sharshevsky


On 11/15/2017 02:15 AM, David Ahern wrote:
> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>> +static int
>> +mlxsw_sp_resource_kvd_size_validate(struct devlink *devlink, u64 size,
>> +struct list_head *resource_list,
>> +struct netlink_ext_ack *extack)
>> +{
>> +struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> +u32 kvd_size, single_size, double_size, linear_size;
>> +struct devlink_resource *resource;
>> +
>> +kvd_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE);
>> +if (kvd_size != size) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "kvd size cannot be 
>> chagned");
>> +return -EINVAL;
>> +}
>> +
>> +list_for_each_entry(resource, resource_list, list) {
>> +switch (resource->id) {
>> +case MLXSW_SP_RESOURCE_KVD_LINEAR:
>> +linear_size = resource->size_new;
>> +break;
>> +case MLXSW_SP_RESOURCE_KVD_HASH_SINGLE:
>> +single_size = resource->size_new;
>> +break;
>> +case MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE:
>> +double_size = resource->size_new;
>> +break;
>> +}
>> +}
>> +
>> +/* Overlap is not supported */
>> +if (linear_size + single_size + double_size > kvd_size) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "Overlap is not 
>> supported");
>> +return -EINVAL;
>> +}
>> +
>> +return 0;
>> +}
>> +
> 
> gcc warnings due to the above:
> 
> /home/dsa/kernel-3.git/drivers/net/ethernet/mellanox/mlxsw/spectrum.c:
> In function ‘mlxsw_sp_resource_kvd_size_validate’:
> /home/dsa/kernel-3.git/drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3974:32:
> warning: ‘linear_size’ may be used uninitialized in this function
> [-Wmaybe-uninitialized]
>   if (linear_size + single_size + double_size > kvd_size) {
> ^
> /home/dsa/kernel-3.git/drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3950:29:
> warning: ‘double_size’ may be used uninitialized in this function
> [-Wmaybe-uninitialized]
>   u32 kvd_size, single_size, double_size, linear_size;
>  ^
> /home/dsa/kernel-3.git/drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3950:16:
> warning: ‘single_size’ may be used uninitialized in this function
> [-Wmaybe-uninitialized]
>   u32 kvd_size, single_size, double_size, linear_size;
> ^
> 
Thanks, will fix


Re: [patch net-next RFC 1/7] devlink: Add support for resource abstraction

2017-10-29 Thread Arkadi Sharshevsky


On 10/25/2017 06:26 PM, David Ahern wrote:
> On 10/24/17 3:22 AM, Jiri Pirko wrote:
>> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
>> index 0cbca96..9db1d70 100644
>> --- a/include/uapi/linux/devlink.h
>> +++ b/include/uapi/linux/devlink.h
>> @@ -69,6 +69,8 @@ enum devlink_command {
>>  DEVLINK_CMD_DPIPE_ENTRIES_GET,
>>  DEVLINK_CMD_DPIPE_HEADERS_GET,
>>  DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
>> +DEVLINK_CMD_RESOURCE_SET,
>> +DEVLINK_CMD_RESOURCE_DUMP,
>>  
>>  /* add new commands above here */
>>  __DEVLINK_CMD_MAX,
>> @@ -201,6 +203,12 @@ enum devlink_attr {
>>  DEVLINK_ATTR_PAD,
>>  
>>  DEVLINK_ATTR_ESWITCH_ENCAP_MODE,/* u8 */
>> +DEVLINK_ATTR_RESOURCES, /* nested */
>> +DEVLINK_ATTR_RESOURCE,  /* nested */
>> +DEVLINK_ATTR_RESOURCE_NAME, /* string */
>> +DEVLINK_ATTR_RESOURCE_SIZE, /* u64 */
>> +DEVLINK_ATTR_RESOURCE_SIZE_NEW, /* u64 */
>> +DEVLINK_ATTR_RESOURCE_ID,   /* u64 */
>>  
>>  /* add new attributes above here, update the policy in devlink.c */
>>  
> 
> Where can I find the userspace patch to try out the RFC?
> 

Hi David,

Thanks for the review. I will send the updated version this week
with the userspace part.


Re: devlink dump of mlxsw_adj table triggers a panic

2017-10-08 Thread Arkadi Sharshevsky


On 10/05/2017 02:57 AM, David Ahern wrote:
> The following devlink command on a 2700 triggers a panic every time.
> Kernel is net-next at 26873308b21654b6e0785b9f9e2c5414d37a4c4c
> 
> $ devlink  dpipe table dump pci/:03:00.0 name mlxsw_adj
> devlink answers: No buffer space available
> 
> 
> I have seen several different stack traces and varying amounts of EMAD
> errors on console:
> 
> [   77.453364] mlxsw_spectrum :03:00.0: EMAD reg access failed
> (tid=64c24a43688,reg_id=200b(sfn),type=query,status=0(operation
> performed))
> [   77.466568] mlxsw_spectrum :03:00.0: Failed to get FDB notifications
> 
> If it does not reproduce for you let me know and I'll grab a trace.
> 
> David
> 

Thanks, will check it out. How many nexthop groups, and how many nexthops
overall, did you configure?

Thanks,
Arkadi


[PATCH iproute2 3/4] devlink: Update devlink UAPI file

2017-09-07 Thread Arkadi Sharshevsky
Update devlink UAPI file.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 include/linux/devlink.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/include/linux/devlink.h b/include/linux/devlink.h
index 7644005..a62695e 100644
--- a/include/linux/devlink.h
+++ b/include/linux/devlink.h
@@ -226,4 +226,22 @@ enum devlink_dpipe_action_type {
DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY,
 };
 
+enum devlink_dpipe_field_ethernet_id {
+   DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC,
+};
+
+enum devlink_dpipe_field_ipv4_id {
+   DEVLINK_DPIPE_FIELD_IPV4_DST_IP,
+};
+
+enum devlink_dpipe_field_ipv6_id {
+   DEVLINK_DPIPE_FIELD_IPV6_DST_IP,
+};
+
+enum devlink_dpipe_header_id {
+   DEVLINK_DPIPE_HEADER_ETHERNET,
+   DEVLINK_DPIPE_HEADER_IPV4,
+   DEVLINK_DPIPE_HEADER_IPV6,
+};
+
 #endif /* _LINUX_DEVLINK_H_ */
-- 
2.4.11



[PATCH iproute2 1/4] devlink: Make match/action parsing more flexible

2017-09-07 Thread Arkadi Sharshevsky
This patch decouples the match/action parsing from printing. This is
done as a preparation for adding the ability to print global header
values, for example print IPv4 address, which require special formatting.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 devlink/devlink.c | 127 ++
 1 file changed, 80 insertions(+), 47 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 8f11f86..36a2b36 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -3077,27 +3077,42 @@ static const char
}
 }
 
-static void pr_out_dpipe_action(struct dpipe_ctx *ctx,
-   uint32_t header_id, uint32_t field_id,
-   uint32_t action_type, bool global)
+struct dpipe_op_info {
+   uint32_t header_id;
+   uint32_t field_id;
+   bool header_global;
+};
+
+struct dpipe_action {
+   struct dpipe_op_info info;
+   uint32_t type;
+};
+
+static void pr_out_dpipe_action(struct dpipe_action *action,
+   struct dpipe_ctx *ctx)
 {
+   struct dpipe_op_info *op_info = >info;
const char *mapping;
 
-   pr_out_str(ctx->dl, "type", dpipe_action_type_e2s(action_type));
-   pr_out_str(ctx->dl, "header", dpipe_header_id2s(ctx, header_id,
-   global));
-   pr_out_str(ctx->dl, "field", dpipe_field_id2s(ctx, header_id, field_id,
- global));
-   mapping = dpipe_mapping_get(ctx, header_id, field_id, global);
+   pr_out_str(ctx->dl, "type",
+  dpipe_action_type_e2s(action->type));
+   pr_out_str(ctx->dl, "header",
+  dpipe_header_id2s(ctx, op_info->header_id,
+op_info->header_global));
+   pr_out_str(ctx->dl, "field",
+  dpipe_field_id2s(ctx, op_info->header_id,
+   op_info->field_id,
+   op_info->header_global));
+   mapping = dpipe_mapping_get(ctx, op_info->header_id,
+   op_info->field_id,
+   op_info->header_global);
if (mapping)
pr_out_str(ctx->dl, "mapping", mapping);
 }
 
-static int dpipe_action_show(struct dpipe_ctx *ctx, struct nlattr *nl)
+static int dpipe_action_parse(struct dpipe_action *action, struct nlattr *nl)
 {
struct nlattr *nla_action[DEVLINK_ATTR_MAX + 1] = {};
-   uint32_t header_id, field_id, action_type;
-   bool global;
int err;
 
err = mnl_attr_parse_nested(nl, attr_cb, nla_action);
@@ -3111,12 +3126,11 @@ static int dpipe_action_show(struct dpipe_ctx *ctx, 
struct nlattr *nl)
return -EINVAL;
}
 
-   header_id = mnl_attr_get_u32(nla_action[DEVLINK_ATTR_DPIPE_HEADER_ID]);
-   field_id = mnl_attr_get_u32(nla_action[DEVLINK_ATTR_DPIPE_FIELD_ID]);
-   action_type = 
mnl_attr_get_u32(nla_action[DEVLINK_ATTR_DPIPE_ACTION_TYPE]);
-   global = 
!!mnl_attr_get_u8(nla_action[DEVLINK_ATTR_DPIPE_HEADER_GLOBAL]);
+   action->type = 
mnl_attr_get_u32(nla_action[DEVLINK_ATTR_DPIPE_ACTION_TYPE]);
+   action->info.header_id = 
mnl_attr_get_u32(nla_action[DEVLINK_ATTR_DPIPE_HEADER_ID]);
+   action->info.field_id = 
mnl_attr_get_u32(nla_action[DEVLINK_ATTR_DPIPE_FIELD_ID]);
+   action->info.header_global = 
!!mnl_attr_get_u8(nla_action[DEVLINK_ATTR_DPIPE_HEADER_GLOBAL]);
 
-   pr_out_dpipe_action(ctx, header_id, field_id, action_type, global);
return 0;
 }
 
@@ -3124,16 +3138,18 @@ static int dpipe_table_actions_show(struct dpipe_ctx 
*ctx,
struct nlattr *nla_actions)
 {
struct nlattr *nla_action;
+   struct dpipe_action action;
 
mnl_attr_for_each_nested(nla_action, nla_actions) {
pr_out_entry_start(ctx->dl);
-   if (dpipe_action_show(ctx, nla_action))
-   goto err_action_show;
+   if (dpipe_action_parse(, nla_action))
+   goto err_action_parse;
+   pr_out_dpipe_action(, ctx);
pr_out_entry_end(ctx->dl);
}
return 0;
 
-err_action_show:
+err_action_parse:
pr_out_entry_end(ctx->dl);
return -EINVAL;
 }
@@ -3149,28 +3165,38 @@ dpipe_match_type_e2s(enum devlink_dpipe_match_type 
match_type)
}
 }
 
-static void pr_out_dpipe_match(struct dpipe_ctx *ctx,
-  uint32_t header_id, uint32_t field_id,
-  uint32_t match_type, bool global)
+struct dpipe_match {
+   struct dpipe_op_info info;
+   uint32_t type;
+};
+
+static

[PATCH iproute2 4/4] devlink: Add support for protocol IPv4/IPv6/Ethernet special formats

2017-09-07 Thread Arkadi Sharshevsky
Add support for protocol IPv4/IPv6/Ethernet special formats.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 devlink/devlink.c | 75 ++-
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index b87de38..39cda06 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "SNAPSHOT.h"
 #include "list.h"
@@ -3401,7 +3402,79 @@ struct dpipe_header_printer {
unsigned int header_id;
 };
 
-static struct dpipe_header_printer *dpipe_header_printers[] = {};
+static void dpipe_field_printer_ipv4_addr(struct dpipe_ctx *ctx,
+ enum dpipe_value_type type,
+ void *value)
+{
+   struct in_addr ip_addr;
+
+   ip_addr.s_addr = htonl(*(uint32_t *)value);
+   pr_out_str(ctx->dl, dpipe_value_type_e2s(type), inet_ntoa(ip_addr));
+}
+
+static void
+dpipe_field_printer_ethernet_addr(struct dpipe_ctx *ctx,
+ enum dpipe_value_type type,
+ void *value)
+{
+   pr_out_str(ctx->dl, dpipe_value_type_e2s(type),
+  ether_ntoa((struct ether_addr *)value));
+}
+
+static void dpipe_field_printer_ipv6_addr(struct dpipe_ctx *ctx,
+ enum dpipe_value_type type,
+ void *value)
+{
+   char str[INET6_ADDRSTRLEN];
+
+   inet_ntop(AF_INET6, value, str, INET6_ADDRSTRLEN);
+   pr_out_str(ctx->dl, dpipe_value_type_e2s(type), str);
+}
+
+static struct dpipe_field_printer dpipe_field_printers_ipv4[] = {
+   {
+   .printer = dpipe_field_printer_ipv4_addr,
+   .field_id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP,
+   }
+};
+
+static struct dpipe_header_printer dpipe_header_printer_ipv4  = {
+   .printers = dpipe_field_printers_ipv4,
+   .printers_count = ARRAY_SIZE(dpipe_field_printers_ipv4),
+   .header_id = DEVLINK_DPIPE_HEADER_IPV4,
+};
+
+static struct dpipe_field_printer dpipe_field_printers_ethernet[] = {
+   {
+   .printer = dpipe_field_printer_ethernet_addr,
+   .field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC,
+   },
+};
+
+static struct dpipe_header_printer dpipe_header_printer_ethernet = {
+   .printers = dpipe_field_printers_ethernet,
+   .printers_count = ARRAY_SIZE(dpipe_field_printers_ethernet),
+   .header_id = DEVLINK_DPIPE_HEADER_ETHERNET,
+};
+
+static struct dpipe_field_printer dpipe_field_printers_ipv6[] = {
+   {
+   .printer = dpipe_field_printer_ipv6_addr,
+   .field_id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP,
+   }
+};
+
+static struct dpipe_header_printer dpipe_header_printer_ipv6 = {
+   .printers = dpipe_field_printers_ipv6,
+   .printers_count = ARRAY_SIZE(dpipe_field_printers_ipv6),
+   .header_id = DEVLINK_DPIPE_HEADER_IPV6,
+};
+
+static struct dpipe_header_printer *dpipe_header_printers[] = {
+   _header_printer_ipv4,
+   _header_printer_ethernet,
+   _header_printer_ipv6,
+};
 
 static int dpipe_print_prot_header(struct dpipe_ctx *ctx,
   struct dpipe_op_info *info,
-- 
2.4.11



[PATCH iproute2 0/4] Add support for dpipe's global header formatting

2017-09-07 Thread Arkadi Sharshevsky
Some dpipe's global header values need special formatting, for example
Ethernet and IP addresses. This patchset adds support for IPv4/6 and
Ethernet's special format.

Arkadi Sharshevsky (4):
  devlink: Make match/action parsing more flexible
  devlink: Add support for special format protocol headers
  devlink: Update devlink UAPI file
  devlink: Add support for protocol IPv4/IPv6/Ethernet special formats

 devlink/devlink.c   | 319 +---
 include/linux/devlink.h |  18 +++
 2 files changed, 265 insertions(+), 72 deletions(-)

-- 
2.4.11



[PATCH iproute2 2/4] devlink: Add support for special format protocol headers

2017-09-07 Thread Arkadi Sharshevsky
In the case of a global header (protocol header), the header:field IDs are
used to look up a special-format printer. If no such printer exists, fall
back to plain value printing.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 devlink/devlink.c | 119 ++
 1 file changed, 94 insertions(+), 25 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 36a2b36..b87de38 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -3372,9 +3372,89 @@ static int cmd_dpipe_table_set(struct dl *dl)
return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
 }
 
-static int dpipe_entry_value_show(struct dpipe_ctx *ctx,
- struct nlattr **nla_match_value)
+enum dpipe_value_type {
+   DPIPE_VALUE_TYPE_VALUE,
+   DPIPE_VALUE_TYPE_MASK,
+};
+
+static const char *
+dpipe_value_type_e2s(enum dpipe_value_type type)
+{
+   switch (type) {
+   case DPIPE_VALUE_TYPE_VALUE:
+   return "value";
+   case DPIPE_VALUE_TYPE_MASK:
+   return "value_mask";
+   default:
+   return "";
+   }
+}
+
+struct dpipe_field_printer {
+   unsigned int field_id;
+   void (*printer)(struct dpipe_ctx *, enum dpipe_value_type, void *);
+};
+
+struct dpipe_header_printer {
+   struct dpipe_field_printer *printers;
+   unsigned int printers_count;
+   unsigned int header_id;
+};
+
+static struct dpipe_header_printer *dpipe_header_printers[] = {};
+
+static int dpipe_print_prot_header(struct dpipe_ctx *ctx,
+  struct dpipe_op_info *info,
+  enum dpipe_value_type type,
+  void *value)
 {
+   unsigned int header_printers_count = ARRAY_SIZE(dpipe_header_printers);
+   struct dpipe_header_printer *header_printer;
+   struct dpipe_field_printer *field_printer;
+   unsigned int field_printers_count;
+   int j;
+   int i;
+
+   for (i = 0; i < header_printers_count; i++) {
+   header_printer = dpipe_header_printers[i];
+   if (header_printer->header_id != info->header_id)
+   continue;
+   field_printers_count = header_printer->printers_count;
+   for (j = 0; j < field_printers_count; j++) {
+   field_printer = _printer->printers[j];
+   if (field_printer->field_id != info->field_id)
+   continue;
+   field_printer->printer(ctx, type, value);
+   return 0;
+   }
+   }
+
+   return -EINVAL;
+}
+
+static void __pr_out_entry_value(struct dpipe_ctx *ctx,
+void *value,
+unsigned int value_len,
+struct dpipe_op_info *info,
+enum dpipe_value_type type)
+{
+   if (info->header_global &&
+   !dpipe_print_prot_header(ctx, info, type, value))
+   return;
+
+   if (value_len == sizeof(uint32_t)) {
+   uint32_t *value_32 = value;
+
+   pr_out_uint(ctx->dl, dpipe_value_type_e2s(type), *value_32);
+   }
+}
+
+static void pr_out_dpipe_entry_value(struct dpipe_ctx *ctx,
+struct nlattr **nla_match_value,
+struct dpipe_op_info *info)
+{
+   void *value, *value_mask;
+   uint32_t value_mapping;
uint16_t value_len;
bool mask, mapping;
 
@@ -3382,27 +3462,20 @@ static int dpipe_entry_value_show(struct dpipe_ctx *ctx,
mapping = !!nla_match_value[DEVLINK_ATTR_DPIPE_VALUE_MAPPING];
 
value_len = 
mnl_attr_get_payload_len(nla_match_value[DEVLINK_ATTR_DPIPE_VALUE]);
-   if (value_len == sizeof(uint32_t)) {
-   uint32_t value, value_mask, value_mapping;
-
-   if (mapping) {
-   value_mapping = 
mnl_attr_get_u32(nla_match_value[DEVLINK_ATTR_DPIPE_VALUE_MAPPING]);
-   pr_out_uint(ctx->dl, "mapping_value", value_mapping);
-   }
-
-   if (mask) {
-   value_mask = 
mnl_attr_get_u32(nla_match_value[DEVLINK_ATTR_DPIPE_VALUE_MASK]);
-   pr_out_uint(ctx->dl, "mask_value", value_mask);
-   }
+   value = mnl_attr_get_payload(nla_match_value[DEVLINK_ATTR_DPIPE_VALUE]);
 
-   value = 
mnl_attr_get_u32(nla_match_value[DEVLINK_ATTR_DPIPE_VALUE]);
-   pr_out_uint(ctx->dl, "value", value);
+   if (mapping) {
+   value_mapping = 
mnl_attr_get_u32(nla_match_value[DEVLINK_ATTR_DPIPE_VALUE_MAPPING]);
+   pr_out_uint(ctx->dl, "mappin

Re: [patch net-next 0/8] mlxsw: Add IPv6 host dpipe table

2017-08-31 Thread Arkadi Sharshevsky


On 08/30/2017 08:26 PM, Andrew Lunn wrote:
> On Wed, Aug 30, 2017 at 02:02:58PM +0200, Jiri Pirko wrote:
>> From: Jiri Pirko 
>>
>> Arkadi says:
>>
>> This patchset adds IPv6 host dpipe table support. This will provide the
>> ability to observe the hardware offloaded IPv6 neighbors.
> 
> Hi Jiri, Arkadi
> 
> Could you give us an example of the output seen in user space.
> 
> Thanks
>   Andrew
> 

Yeah Sure,

Andrew, please note that the output can be produced in JSON format with the
-j -p flags, so you can always expose multiple tables and easily present the
data in user space as a 2D array if you wish. This is just formatting.

This is an example of the IPv4/6 host (neighbor) tables with two entries. The
table is a hash table which performs a match on rif and dst_ip, then sets
the dst mac. The rif is an internal object which is mapped to an ifindex.

This is an example for the IPv4:

$devlink dpipe table dump pci/:03:00.0 name mlxsw_host4
pci/:03:00.0:
  index 0
  match_value:
type field_exact header mlxsw_meta field erif_port mapping ifindex
mapping_value 732 value 0
type field_exact header ipv4 field destination ip value 20.0.0.1
  action_value:
type field_modify header ethernet field destination_mac value
e4:1d:2d:a5:f0:4a

  index 1
  match_value:
type field_exact header mlxsw_meta field erif_port mapping ifindex
mapping_value 733 value 1
type field_exact header ipv4 field destination ip value 10.0.0.1
  action_value:
type field_modify header ethernet field destination_mac value
e4:1d:2d:a5:f1:7e

This is an example for IPv6:

$devlink dpipe table dump pci/:03:00.0 name mlxsw_host6
pci/:03:00.0:
  index 0
  match_value:
type field_exact header mlxsw_meta field erif_port mapping ifindex
mapping_value 732 value 0
type field_exact header ipv6 field destination ip value
2001:2011:0:f101::2
  action_value:
type field_modify header ethernet field destination_mac value
e4:1d:2d:a5:f0:4a

  index 1
  match_value:
type field_exact header mlxsw_meta field erif_port mapping ifindex
mapping_value 733 value 1
type field_exact header ipv6 field destination ip value
2001:1011:0:f101::2
  action_value:
type field_modify header ethernet field destination_mac value
e4:1d:2d:a5:f1:7e


Thanks,
Arkadi


Re: mlxsw and rtnl lock

2017-08-29 Thread Arkadi Sharshevsky


On 08/29/2017 11:04 PM, David Ahern wrote:
> On 8/29/17 12:10 AM, Arkadi Sharshevsky wrote:
>>
>>
>> On 08/28/2017 09:00 PM, David Ahern wrote:
>>> On 8/26/17 11:04 AM, Ido Schimmel wrote:
>>>> Regarding the silent abort, that's intentional. You can look at the same
>>>> code in v4.9 - when the chain was still blocking - and you'll see that
>>>> we didn't propagate the error even then. This was discussed in the past
>>>> and the conclusion was that user doesn't expect to operation to fail. If
>>>> hardware resources are exceeded, we let the kernel take care of the
>>>> forwarding instead.
>>>>
>>>
>>> In addition to Roopa's comments... The silent abort is not a good user
>>> experience. Right now it's add a network address or route, cross fingers
>>> and hope it does not overflow some limit (nexthop, ecmp, neighbor,
>>> prefix, etc) that triggers the offload abort.
>>>
>>> The mlxsw driver queries for some limits (e.g., max rifs) but I don't
>>> see any query related to current usage, and there is no API to pass any
>>> of that data to user space so user space has no programmatic way to
>>> handle this. I realize you are aware of this limitation. The point is to
>>> emphasize the need to resolve this.
>>>
>>
>> We actually thought about providing he user some tools to understand
>> the ASIC's limitations by introducing the 'resource' object to devlink.
>>
>> By linking dpipe tables to resources the user can understand which
>> hardware processes share a common resource, furthermore this resources
>> usage could be observed. By this more visibility can be obtained.
>>
>> Its not a remedy for the silent abort, but, maybe a notification
>> can be sent from devlink in case of abort that some resources is
>> full.
>>
>> This proposition was sent as RFC several weeks ago.
>>
> 
> Do you have patches (kernel and devlink) for the proposal?
> 

No, only the design RFC which describes the UAPI, devlink
commands and the devlink/driver interactions.

I wanted to receive some feedback before the coding.


Re: mlxsw and rtnl lock

2017-08-29 Thread Arkadi Sharshevsky


On 08/28/2017 09:00 PM, David Ahern wrote:
> On 8/26/17 11:04 AM, Ido Schimmel wrote:
>> Regarding the silent abort, that's intentional. You can look at the same
>> code in v4.9 - when the chain was still blocking - and you'll see that
>> we didn't propagate the error even then. This was discussed in the past
>> and the conclusion was that user doesn't expect to operation to fail. If
>> hardware resources are exceeded, we let the kernel take care of the
>> forwarding instead.
>>
> 
> In addition to Roopa's comments... The silent abort is not a good user
> experience. Right now it's add a network address or route, cross fingers
> and hope it does not overflow some limit (nexthop, ecmp, neighbor,
> prefix, etc) that triggers the offload abort.
> 
> The mlxsw driver queries for some limits (e.g., max rifs) but I don't
> see any query related to current usage, and there is no API to pass any
> of that data to user space so user space has no programmatic way to
> handle this. I realize you are aware of this limitation. The point is to

The first dpipe table that was introduced was the erif table,
which gathers L3 statistics.

The rifs are actually also a fixed-size resource, so maybe it is
more correct to introduce them as 'resources' and connect them to
the erif table. That way you will be able to obtain the current
usage, and receive a notification when they are exhausted.

> emphasize the need to resolve this.
> 


Re: [PATCH net-next v2 00/10] net: dsa: add generic debugfs interface

2017-08-29 Thread Arkadi Sharshevsky


On 08/29/2017 03:50 PM, Andrew Lunn wrote:
> On Tue, Aug 29, 2017 at 08:25:23AM +0200, Jiri Pirko wrote:
>> Mon, Aug 28, 2017 at 10:08:34PM CEST, and...@lunn.ch wrote:
>>>> I see this overlaps a lot with DPIPE. Why won't you use that to expose
>>>> your hw state?
>>>
>>> We took a look at dpipe and i talked to you about using it for this
>>> sort of thing at netconf/netdev. But dpipe has issues displaying the
>>> sort of information we have. I never figured out how to do two
>>> dimensional tables. The output of the dpipe command is pretty
>>> unreadable. A lot of the information being dumped here is not about
>>> the data pipe, etc.
>>
>> So improve it. No problem. Also, we extend it to support what you neede.
> 
> Will i did try to do this back in March. And i failed.
> 
> Lets start with stats. Vivien gives an example on the cover letter:
> 
> # pr -mt switch0/port{5,6}/stats
> in_good_octets  : 0 in_good_octets  : 13824
> in_bad_octets   : 0 in_bad_octets   : 0
> in_unicast  : 0 in_unicast  : 0
> in_broadcasts   : 0 in_broadcasts   : 216
> in_multicasts   : 0 in_multicasts   : 0
> in_pause: 0 in_pause: 0
> in_undersize: 0 in_undersize: 0
> 
> This is what i tried to implement using dpipe. It is a simple two
> dimensional table. First column is a string, second a u64. In debugfs
> we have such a table per port. That fits with the hierarchy that each
> port is a directory in debugfs. But it could also be implemented as
> one table with N+1 columns, for N switch ports.
>

Hi Andrew,

This looks to me like basic L2 statistics that are obtained via
ethtool, I remember you had this problem with the DSA and CPU port.
Is this still the case?

I remembered we wanted to use dpipe for the DSA routing table
and IP priority table.

I think both those processes really look like match/action tables,
thus they can be modeled successfully by dpipe.

> How about you, or one of your team, implement that. It should be able
> to use the dsa_loop driver, which is a simple dummy switch. But it
> does have statistics counters for all ports. Florian or I can help you
> get it running if needed.
> 
> This branch contains some of the basic plumbing code from my previous
> attempt:
> 
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Flunn%2Flinux.git=02%7C01%7Carkadis%40mellanox.com%7Cb3cac139af204f79259c08d4eedc8410%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636396078291326351=K5D3TAb2spckuF5k88oOaVt0dmtHj0AwE8bEEGPPdGI%3D=0
>  v4.11-rc4-net-next-dpipe
> 
>Andrew
> 


Re: mlxsw and rtnl lock

2017-08-29 Thread Arkadi Sharshevsky


On 08/28/2017 09:00 PM, David Ahern wrote:
> On 8/26/17 11:04 AM, Ido Schimmel wrote:
>> Regarding the silent abort, that's intentional. You can look at the same
>> code in v4.9 - when the chain was still blocking - and you'll see that
>> we didn't propagate the error even then. This was discussed in the past
>> and the conclusion was that user doesn't expect to operation to fail. If
>> hardware resources are exceeded, we let the kernel take care of the
>> forwarding instead.
>>
> 
> In addition to Roopa's comments... The silent abort is not a good user
> experience. Right now it's add a network address or route, cross fingers
> and hope it does not overflow some limit (nexthop, ecmp, neighbor,
> prefix, etc) that triggers the offload abort.
> 
> The mlxsw driver queries for some limits (e.g., max rifs) but I don't
> see any query related to current usage, and there is no API to pass any
> of that data to user space so user space has no programmatic way to
> handle this. I realize you are aware of this limitation. The point is to
> emphasize the need to resolve this.
> 

We actually thought about providing the user some tools to understand
the ASIC's limitations by introducing the 'resource' object to devlink.

By linking dpipe tables to resources the user can understand which
hardware processes share a common resource; furthermore, the usage of these
resources could be observed. This way more visibility can be obtained.

It's not a remedy for the silent abort, but maybe a notification
can be sent from devlink in case of an abort, saying that some resource is
full.

This proposition was sent as RFC several weeks ago.








Re: [PATCH net] bridge: check for null fdb->dst before notifying switchdev drivers

2017-08-27 Thread Arkadi Sharshevsky


On 08/27/2017 07:13 AM, Roopa Prabhu wrote:
> From: Roopa Prabhu 
> 
> current switchdev drivers dont seem to support offloading fdb
> entries pointing to the bridge device which have fdb->dst
> not set to any port. This patch adds a NULL fdb->dst check in
> the switchdev notifier code.
> 
> This patch fixes the below NULL ptr dereference:
> $bridge fdb add 00:02:00:00:00:33 dev br0 self
> 
> [   69.953374] BUG: unable to handle kernel NULL pointer dereference at
> 0008
> [   69.954044] IP: br_switchdev_fdb_notify+0x29/0x80
> [   69.954044] PGD 66527067
> [   69.954044] P4D 66527067
> [   69.954044] PUD 7899c067
> [   69.954044] PMD 0
> [   69.954044]
> [   69.954044] Oops:  [#1] SMP
> [   69.954044] Modules linked in:
> [   69.954044] CPU: 1 PID: 3074 Comm: bridge Not tainted 4.13.0-rc6+ #1
> [   69.954044] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org
> 04/01/2014
> [   69.954044] task: 88007b827140 task.stack: c90001564000
> [   69.954044] RIP: 0010:br_switchdev_fdb_notify+0x29/0x80
> [   69.954044] RSP: 0018:c90001567918 EFLAGS: 00010246
> [   69.954044] RAX:  RBX: 8800795e0880 RCX:
> 00c0
> [   69.954044] RDX: c90001567920 RSI: 001c RDI:
> 8800795d0600
> [   69.954044] RBP: c90001567938 R08: 8800795d0600 R09:
> 
> [   69.954044] R10: c90001567a88 R11: 88007b849400 R12:
> 8800795e0880
> [   69.954044] R13: 8800795d0600 R14: 81ef8880 R15:
> 001c
> [   69.954044] FS:  7f93d3085700() GS:88007fd0()
> knlGS:
> [   69.954044] CS:  0010 DS:  ES:  CR0: 80050033
> [   69.954044] CR2: 0008 CR3: 66551000 CR4:
> 06e0
> [   69.954044] Call Trace:
> [   69.954044]  fdb_notify+0x3f/0xf0
> [   69.954044]  __br_fdb_add.isra.12+0x1a7/0x370
> [   69.954044]  br_fdb_add+0x178/0x280
> [   69.954044]  rtnl_fdb_add+0x10a/0x200
> [   69.954044]  rtnetlink_rcv_msg+0x1b4/0x240
> [   69.954044]  ? skb_free_head+0x21/0x40
> [   69.954044]  ? rtnl_calcit.isra.18+0xf0/0xf0
> [   69.954044]  netlink_rcv_skb+0xed/0x120
> [   69.954044]  rtnetlink_rcv+0x15/0x20
> [   69.954044]  netlink_unicast+0x180/0x200
> [   69.954044]  netlink_sendmsg+0x291/0x370
> [   69.954044]  ___sys_sendmsg+0x180/0x2e0
> [   69.954044]  ? filemap_map_pages+0x2db/0x370
> [   69.954044]  ? do_wp_page+0x11d/0x420
> [   69.954044]  ? __handle_mm_fault+0x794/0xd80
> [   69.954044]  ? vma_link+0xcb/0xd0
> [   69.954044]  __sys_sendmsg+0x4c/0x90
> [   69.954044]  SyS_sendmsg+0x12/0x20
> [   69.954044]  do_syscall_64+0x63/0xe0
> [   69.954044]  entry_SYSCALL64_slow_path+0x25/0x25
> [   69.954044] RIP: 0033:0x7f93d2bad690
> [   69.954044] RSP: 002b:7ffc7217a638 EFLAGS: 0246 ORIG_RAX:
> 002e
> [   69.954044] RAX: ffda RBX: 7ffc72182eac RCX:
> 7f93d2bad690
> [   69.954044] RDX:  RSI: 7ffc7217a670 RDI:
> 0003
> [   69.954044] RBP: 59a1f7f8 R08: 0006 R09:
> 000a
> [   69.954044] R10: 7ffc7217a400 R11: 0246 R12:
> 7ffc7217a670
> [   69.954044] R13: 7ffc72182a98 R14: 006114c0 R15:
> 7ffc72182aa0
> [   69.954044] Code: 1f 00 66 66 66 66 90 55 48 89 e5 48 83 ec 20 f6 47
> 20 04 74 0a 83 fe 1c 74 09 83 fe 1d 74 2c c9 66 90 c3 48 8b 47 10 48 8d
> 55 e8 <48> 8b 70 08 0f b7 47 1e 48 83 c7 18 48 89 7d f0 bf 03 00 00 00
> [   69.954044] RIP: br_switchdev_fdb_notify+0x29/0x80 RSP:
> c90001567918
> [   69.954044] CR2: 0008
> [   69.954044] ---[ end trace 03e9eec4a82c238b ]---
> 
> Fixes: 6b26b51b1d13 ("net: bridge: Add support for notifying devices about 
> FDB add/del")
> Signed-off-by: Roopa Prabhu 
> ---
>  net/bridge/br_switchdev.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
> index 181a44d..f6b1c7d 100644
> --- a/net/bridge/br_switchdev.c
> +++ b/net/bridge/br_switchdev.c
> @@ -115,7 +115,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const 
> unsigned char *mac,
>  void
>  br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
>  {
> - if (!fdb->added_by_user)
> + if (!fdb->added_by_user || !fdb->dst)
>   return;
>  
>   switch (type) {
> 

Thanks, missed that.
Arkadi


Re: [patch net-next 11/12] mlxsw: spectrum_dpipe: Add support for IPv4 host table dump

2017-08-27 Thread Arkadi Sharshevsky


On 08/25/2017 10:51 PM, David Ahern wrote:
> On 8/25/17 2:26 AM, Arkadi Sharshevsky wrote:
>>
>>
>> On 08/24/2017 10:26 PM, David Ahern wrote:
>>> On 8/23/17 11:40 PM, Jiri Pirko wrote:
>>>> +static int
>>>> +mlxsw_sp_dpipe_table_host_entries_get(struct mlxsw_sp *mlxsw_sp,
>>>> +struct devlink_dpipe_entry *entry,
>>>> +bool counters_enabled,
>>>> +struct devlink_dpipe_dump_ctx *dump_ctx,
>>>> +int type)
>>>> +{
>>>> +  int rif_neigh_count = 0;
>>>> +  int rif_neigh_skip = 0;
>>>> +  int neigh_count = 0;
>>>> +  int rif_count;
>>>> +  int i, j;
>>>> +  int err;
>>>> +
>>>> +  rtnl_lock();
>>>
>>> Why does a h/w driver dumping its tables need the rtnl lock?
>>>
>>
>> This table represents the hw IPv4 arp table, and the
>> driver depends on rtnl to be held.
>>
> 
> Meaning mlxsw does not have its own locks protecting data structures --
> e.g., rif adds and deletes, so it is relying on rtnl?
> 
> Also, this dpipe capability seems to be just dumping data structures
> maintained by the driver. ie., you can compare the mlxsw view of
> networking state to IPv4 and IPv6 level tables. Any plans to offer a
> command that reads data from the h/w and passes that back to the user?
> i.e, a command to compare kernel tables to h/w state?
> 

So this infra should provide several things-

1) Reveal the interactions between various hardware tables
2) Counters for these tables
3) Debuggability

The first two can be achieved right now. Regarding debuggability, which
is a bit vague, the current assumption is that the driver's internal data
structures are synced with hardware (which is not always true), and maybe
are not synced with the kernel, so this can be achieved right now by
dumping the internal state of the driver. Furthermore, the counters are
dumped from the hardware and give the user additional indication.

I completely agree that the hardware should be dumped in order to
validate the internal data structures are really synced with HW. This
could be usable for observing data corruptions inside the ASIC and
various complex bugs.

In order to address that I thought about maybe adding a flag called
"validate_hw" so that during the dump the driver<-->hw state could be
validated.

What do you think about it?

Thanks,
Arkadi


Re: [net-next:master 1324/1341] drivers/net//ethernet/mellanox/mlxsw/spectrum_dpipe.c:323:9: error: too few arguments to function 'devlink_dpipe_table_register'

2017-08-25 Thread Arkadi Sharshevsky


On 08/25/2017 04:11 AM, David Miller wrote:
> From: kbuild test robot 
> Date: Fri, 25 Aug 2017 08:03:28 +0800
> 
>> All errors (new ones prefixed by >>):
>>
>>drivers/net//ethernet/mellanox/mlxsw/spectrum_dpipe.c: In function 
>> 'mlxsw_sp_dpipe_erif_table_init':
 drivers/net//ethernet/mellanox/mlxsw/spectrum_dpipe.c:323:9: error: too 
 few arguments to function 'devlink_dpipe_table_register'
>>  return devlink_dpipe_table_register(devlink,
>> ^
>>In file included from 
>> drivers/net//ethernet/mellanox/mlxsw/spectrum_dpipe.c:36:0:
>>include/net/devlink.h:401:1: note: declared here
>> devlink_dpipe_table_register(struct devlink *devlink,
>> ^
>>drivers/net//ethernet/mellanox/mlxsw/spectrum_dpipe.c:327:1: warning: 
>> control reaches end of non-void function [-Wreturn-type]
>> }
>> ^
> 
> I'll push the following fix into net-next for this:
> 
> 
> From 790c6056686cc4dd5b149b330bbd5ae208d4d721 Mon Sep 17 00:00:00 2001
> From: "David S. Miller" 
> Date: Thu, 24 Aug 2017 18:10:46 -0700
> Subject: [PATCH] devlink: Fix devlink_dpipe_table_register() stub signature.
> 
> One too many arguments compared to the non-stub version.
> 
> Reported-by: kbuild test robot 
> Fixes: ffd3cdccf214 ("devlink: Add support for dynamic table size")
> Signed-off-by: David S. Miller 
> ---
>  include/net/devlink.h | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/include/net/devlink.h b/include/net/devlink.h
> index 047a0c54f652..aaf7178127a2 100644
> --- a/include/net/devlink.h
> +++ b/include/net/devlink.h
> @@ -402,8 +402,7 @@ static inline int
>  devlink_dpipe_table_register(struct devlink *devlink,
>const char *table_name,
>struct devlink_dpipe_table_ops *table_ops,
> -  void *priv, u64 size,
> -  bool counter_control_extern)
> +  void *priv, bool counter_control_extern)
>  {
>   return 0;
>  }
> 

Thank you, sorry for the mistake.


Re: [patch net-next 11/12] mlxsw: spectrum_dpipe: Add support for IPv4 host table dump

2017-08-25 Thread Arkadi Sharshevsky


On 08/24/2017 10:26 PM, David Ahern wrote:
> On 8/23/17 11:40 PM, Jiri Pirko wrote:
>> +static int
>> +mlxsw_sp_dpipe_table_host_entries_get(struct mlxsw_sp *mlxsw_sp,
>> +  struct devlink_dpipe_entry *entry,
>> +  bool counters_enabled,
>> +  struct devlink_dpipe_dump_ctx *dump_ctx,
>> +  int type)
>> +{
>> +int rif_neigh_count = 0;
>> +int rif_neigh_skip = 0;
>> +int neigh_count = 0;
>> +int rif_count;
>> +int i, j;
>> +int err;
>> +
>> +rtnl_lock();
> 
> Why does a h/w driver dumping its tables need the rtnl lock?
> 

This table represents the hw IPv4 arp table, and the
driver depends on rtnl to be held.


Re: Driver profiles RFC

2017-08-09 Thread Arkadi Sharshevsky


On 08/08/2017 07:08 PM, Roopa Prabhu wrote:
> On Tue, Aug 8, 2017 at 6:15 AM, Arkadi Sharshevsky <arka...@mellanox.com> 
> wrote:
>> Drivers may require driver specific information during the init stage.
>> For example, memory based shared resource which should be segmented for
>> different ASIC processes, such as FDB and LPM lookups.
>>
>> The current mlxsw implementation assumes some default values, which are
>> const and cannot be changed due to lack of UAPI for its configuration
>> (module params is not an option). Those values can greatly impact the
>> scale of the hardware processes, such as the maximum sizes of the FDB/LPM
>> tables. Furthermore, those values should be consistent between driver
>> reloads.
>>
>> The interface called DPIPE [1] was introduced in order to provide
>> abstraction of the hardware pipeline. This RFC letter suggests solving
>> this problem by enhancing the DPIPE hardware abstraction model.
>>
>> DPIPE Resource
>> ==
>>
>> In order to represent ASIC wide resources space a new object should be
>> introduced called "resource". It was originally suggested as future
>> extension in [1] in order to give the user visibility about the tables
>> limitation due to some shared resource. For example FDB and LPM share
>> a common hash based memory. This abstraction can be also used for
>> providing static configuration for such resources.
>>
>> Resource
>> 
>> The resource object defines generic hardware resource like memory,
>> counter pool, etc. which can be described by name and size. The resource
>> can be nested, for example the internal ASIC's memory can be split into
>> two parts, as can be seen in the following diagram:
>>
>> +---+
>> |  Internal Mem |
>> |   |
>> |   Size: 3M*   |
>> +---+
>>   /   \
>>  / \
>> /   \
>>/ \
>>   /   \
>>  +--+  +--+
>>  |Linear|  | Hash |
>>  |  |  |  |
>>  |   Size: 1M   |  |   Size: 2M   |
>>  +--+  +--+
>>
>> *The number are provided as an example and do not reflect real ASIC
>>  resource sizes
>>
>> Where the hash portion is used for FDB/LPM table lookups, and the linear
>> one is used by the routing adjacency table. Each resource can be described
>> by a name, size and list of children. Example for dumping the described
>> above structure:
>>
>> #devlink dpipe resource dump tree pci/:03:00.0 Mem
>> {
>> "resource": {
>>"pci/:03:00.0": [{
>> "name": "Mem",
>> "size": 3M,
>> "resource": [{
>>   "name": "Mem_Linear",
>>   "size": "1M",
>>  }, {
>>   "name": "Mem_Hash",
>>   "size": "2MK",
>>  }
>>   }]
>> }]
>>  }
>> }
>>
>> Each DPIPE table can be connected to one resource.
>>
>> Driver <--> Devlink API
>> ===
>> Each driver will register his resources with default values at init in
>> a similar way to DPIPE table registration. In case those resources already
>> exist the default values are discarded. The user will be able to dump and
>> update the resources. In order for the changes to take place the user will
>> need to re-initiate the driver by a specific devlink knob.
>>
>> The above described procedure will require extra reload of the driver.
>> This can be improved as a future optimization.
>>
>> UAPI
>> 
>> The user will be able to update the resources on a per resource basis:
>>
>> $devlink dpipe resource set pci/:03:00.0 Mem_Linear 2M
>>
>> For some resources the size is fixed, for example the size of the internal
>> memory cannot be changed. It is provided merely in order to reflect the
>> nested structure of the resource and to imply the user that Mem = Linear +
>> Hash, thus a set operation on it w

Re: Driver profiles RFC

2017-08-08 Thread Arkadi Sharshevsky


On 08/08/2017 04:54 PM, Andrew Lunn wrote:
> On Tue, Aug 08, 2017 at 04:15:41PM +0300, Arkadi Sharshevsky wrote:
>> Drivers may require driver specific information during the init stage.
>> For example, memory based shared resource which should be segmented for
>> different ASIC processes, such as FDB and LPM lookups.
> 
> Hi Arkadi
> 
> Have you looked around other subsystems to see if they have already
> solved this problem?
> 

One obvious possible solution which other subsystems use is module params,
which is not acceptable.

> How about GPUs? Do they have a similar requirement?

Seems they are using module params. Furthermore, I checked the DRM API
and such a feature is not supported.

> 
> This seems like a generic problem for 'smart' peripherals. How would
> you use dpipe with a GPU for example?
> 
> Andrew
> 

Thanks for the review.
Arkadi


Driver profiles RFC

2017-08-08 Thread Arkadi Sharshevsky
Drivers may require driver specific information during the init stage.
For example, memory based shared resource which should be segmented for
different ASIC processes, such as FDB and LPM lookups.

The current mlxsw implementation assumes some default values, which are
const and cannot be changed due to lack of UAPI for its configuration
(module params is not an option). Those values can greatly impact the
scale of the hardware processes, such as the maximum sizes of the FDB/LPM
tables. Furthermore, those values should be consistent between driver
reloads.

The interface called DPIPE [1] was introduced in order to provide
abstraction of the hardware pipeline. This RFC letter suggests solving
this problem by enhancing the DPIPE hardware abstraction model.

DPIPE Resource
==

In order to represent ASIC wide resources space a new object should be
introduced called "resource". It was originally suggested as future
extension in [1] in order to give the user visibility about the tables
limitation due to some shared resource. For example FDB and LPM share
a common hash based memory. This abstraction can be also used for
providing static configuration for such resources.

Resource

The resource object defines generic hardware resource like memory,
counter pool, etc. which can be described by name and size. The resource
can be nested, for example the internal ASIC's memory can be split into
two parts, as can be seen in the following diagram:

+---+
|  Internal Mem |
|   |
|   Size: 3M*   |
+---+
  /   \
 / \
/   \
   / \
  /   \
 +--+  +--+
 |Linear|  | Hash |
 |  |  |  |
 |   Size: 1M   |  |   Size: 2M   |
 +--+  +--+

*The numbers are provided as an example and do not reflect real ASIC
 resource sizes

Where the hash portion is used for FDB/LPM table lookups, and the linear
one is used by the routing adjacency table. Each resource can be described
by a name, size and list of children. Example for dumping the described
above structure:

#devlink dpipe resource dump tree pci/:03:00.0 Mem
{
"resource": {
   "pci/:03:00.0": [{
"name": "Mem",
"size": 3M,
"resource": [{
  "name": "Mem_Linear",
  "size": "1M",
 }, {
  "name": "Mem_Hash",
  "size": "2MK",
 }
  }]
}]
 }
}

Each DPIPE table can be connected to one resource.

Driver <--> Devlink API
===
Each driver will register its resources with default values at init in
a similar way to DPIPE table registration. In case those resources already
exist the default values are discarded. The user will be able to dump and
update the resources. In order for the changes to take place the user will
need to re-initiate the driver by a specific devlink knob.

The above described procedure will require extra reload of the driver.
This can be improved as a future optimization.

UAPI

The user will be able to update the resources on a per resource basis:

$devlink dpipe resource set pci/:03:00.0 Mem_Linear 2M

For some resources the size is fixed, for example the size of the internal
memory cannot be changed. It is provided merely in order to reflect the
nested structure of the resource and to imply the user that Mem = Linear +
Hash, thus a set operation on it will fail.

The user can dump the current resource configuration:

#devlink dpipe resource dump tree pci/:03:00.0 Mem

The user can specify 'tree' in order to show all the nested resources under
the specified one. In case no 'resource name' is specified the TOP hierarchy
will be dumped.

After a successful resource update the driver should be re-instantiated in
order for the changes to take place:

$devlink reload pci/:03:00.0

User Configuration
--
Such a UAPI is very low level, and thus an average user may not know how to
adjust these sizes according to his needs. The vendor can provide several
tested configuration files that the user can choose from. Each config file
will be measured in terms of: MAC addresses, L3 Neighbors (IPv4, IPv6),
LPM entries (IPv4,IPv6) in order to provide approximate results. By this an
average user will choose one of the provided ones. Furthermore, a more
advanced user could play with the numbers for his personal benefit.

Reference
=
[1] https://netdevconf.org/2.1/papers/dpipe_netdev_2_1.odt



[PATCH net-next v3 10/13] net: dsa: Remove redundant MDB dump support

2017-08-06 Thread Arkadi Sharshevsky
Currently the MDB HW database is synced with the bridge's one, thus
there is no need to support special dump functionality.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 include/net/dsa.h  |  4 
 net/dsa/dsa_priv.h |  2 --
 net/dsa/port.c | 11 ---
 net/dsa/slave.c|  3 ---
 4 files changed, 20 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index d7b9bdd..4ef1859 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -400,10 +400,6 @@ struct dsa_switch_ops {
struct switchdev_trans *trans);
int (*port_mdb_del)(struct dsa_switch *ds, int port,
const struct switchdev_obj_port_mdb *mdb);
-   int (*port_mdb_dump)(struct dsa_switch *ds, int port,
-struct switchdev_obj_port_mdb *mdb,
- switchdev_obj_dump_cb_t *cb);
-
/*
 * RXNFC
 */
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 77ddec4..3207071 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -141,8 +141,6 @@ int dsa_port_mdb_add(struct dsa_port *dp,
 struct switchdev_trans *trans);
 int dsa_port_mdb_del(struct dsa_port *dp,
 const struct switchdev_obj_port_mdb *mdb);
-int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb,
- switchdev_obj_dump_cb_t *cb);
 int dsa_port_vlan_add(struct dsa_port *dp,
  const struct switchdev_obj_port_vlan *vlan,
  struct switchdev_trans *trans);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index ce19216..7378782 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -210,17 +210,6 @@ int dsa_port_mdb_del(struct dsa_port *dp,
return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, );
 }
 
-int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb,
- switchdev_obj_dump_cb_t *cb)
-{
-   struct dsa_switch *ds = dp->ds;
-
-   if (ds->ops->port_mdb_dump)
-   return ds->ops->port_mdb_dump(ds, dp->index, mdb, cb);
-
-   return -EOPNOTSUPP;
-}
-
 int dsa_port_vlan_add(struct dsa_port *dp,
  const struct switchdev_obj_port_vlan *vlan,
  struct switchdev_trans *trans)
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 5b37298..8482060 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -299,9 +299,6 @@ static int dsa_slave_port_obj_dump(struct net_device *dev,
case SWITCHDEV_OBJ_ID_PORT_FDB:
err = dsa_port_fdb_dump(dp, SWITCHDEV_OBJ_PORT_FDB(obj), cb);
break;
-   case SWITCHDEV_OBJ_ID_PORT_MDB:
-   err = dsa_port_mdb_dump(dp, SWITCHDEV_OBJ_PORT_MDB(obj), cb);
-   break;
default:
err = -EOPNOTSUPP;
break;
-- 
2.4.11



[PATCH net-next v3 09/13] net: dsa: Remove support for MDB dump from DSA's drivers

2017-08-06 Thread Arkadi Sharshevsky
This is done as a preparation before removing support for MDB dump from
DSA core. The MDBs are synced with the bridge and thus there is no
need for special dump operation support.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 drivers/net/dsa/microchip/ksz_common.c |  9 -
 drivers/net/dsa/mv88e6xxx/chip.c   | 24 
 2 files changed, 33 deletions(-)

diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index a53ce59..4de9d90 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1020,14 +1020,6 @@ static int ksz_port_mdb_del(struct dsa_switch *ds, int 
port,
return ret;
 }
 
-static int ksz_port_mdb_dump(struct dsa_switch *ds, int port,
-struct switchdev_obj_port_mdb *mdb,
-switchdev_obj_dump_cb_t *cb)
-{
-   /* this is not called by switch layer */
-   return 0;
-}
-
 static int ksz_port_mirror_add(struct dsa_switch *ds, int port,
   struct dsa_mall_mirror_tc_entry *mirror,
   bool ingress)
@@ -1090,7 +1082,6 @@ static const struct dsa_switch_ops ksz_switch_ops = {
.port_mdb_prepare   = ksz_port_mdb_prepare,
.port_mdb_add   = ksz_port_mdb_add,
.port_mdb_del   = ksz_port_mdb_del,
-   .port_mdb_dump  = ksz_port_mdb_dump,
.port_mirror_add= ksz_port_mirror_add,
.port_mirror_del= ksz_port_mirror_del,
 };
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 5bb1138..1f5c202 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1414,15 +1414,6 @@ static int mv88e6xxx_port_db_dump_fid(struct 
mv88e6xxx_chip *chip,
fdb->ndm_state = NUD_NOARP;
else
fdb->ndm_state = NUD_REACHABLE;
-   } else if (obj->id == SWITCHDEV_OBJ_ID_PORT_MDB) {
-   struct switchdev_obj_port_mdb *mdb;
-
-   if (!is_multicast_ether_addr(addr.mac))
-   continue;
-
-   mdb = SWITCHDEV_OBJ_PORT_MDB(obj);
-   mdb->vid = vid;
-   ether_addr_copy(mdb->addr, addr.mac);
} else {
return -EOPNOTSUPP;
}
@@ -3800,20 +3791,6 @@ static int mv88e6xxx_port_mdb_del(struct dsa_switch *ds, 
int port,
return err;
 }
 
-static int mv88e6xxx_port_mdb_dump(struct dsa_switch *ds, int port,
-  struct switchdev_obj_port_mdb *mdb,
-  switchdev_obj_dump_cb_t *cb)
-{
-   struct mv88e6xxx_chip *chip = ds->priv;
-   int err;
-
-   mutex_lock(>reg_lock);
-   err = mv88e6xxx_port_db_dump(chip, port, >obj, cb);
-   mutex_unlock(>reg_lock);
-
-   return err;
-}
-
 static const struct dsa_switch_ops mv88e6xxx_switch_ops = {
.probe  = mv88e6xxx_drv_probe,
.get_tag_protocol   = mv88e6xxx_get_tag_protocol,
@@ -3847,7 +3824,6 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = 
{
.port_mdb_prepare   = mv88e6xxx_port_mdb_prepare,
.port_mdb_add   = mv88e6xxx_port_mdb_add,
.port_mdb_del   = mv88e6xxx_port_mdb_del,
-   .port_mdb_dump  = mv88e6xxx_port_mdb_dump,
.crosschip_bridge_join  = mv88e6xxx_crosschip_bridge_join,
.crosschip_bridge_leave = mv88e6xxx_crosschip_bridge_leave,
 };
-- 
2.4.11



[PATCH net-next v3 02/13] net: dsa: Remove prepare phase for FDB

2017-08-06 Thread Arkadi Sharshevsky
The prepare phase for FDB add is unneeded because most DSA devices
can have failures during bus transactions (SPI, I2C, etc.); thus, the
prepare phase cannot guarantee the success of the commit stage.

The support for learning FDB through notification chain, which will be
introduced in the following patches, will provide the ability to notify
back the bridge about successful offload.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
Reviewed-by: Florian Fainelli <f.faine...@gmail.com>
---
 drivers/net/dsa/b53/b53_common.c   | 17 +++--
 drivers/net/dsa/b53/b53_priv.h |  6 ++
 drivers/net/dsa/bcm_sf2.c  |  1 -
 drivers/net/dsa/microchip/ksz_common.c | 24 ++--
 drivers/net/dsa/mt7530.c   | 25 -
 drivers/net/dsa/mv88e6xxx/chip.c   | 23 +++
 drivers/net/dsa/qca8k.c| 18 +-
 include/net/dsa.h  |  4 +---
 net/dsa/dsa_priv.h |  4 +---
 net/dsa/port.c |  4 +---
 net/dsa/slave.c|  4 +++-
 net/dsa/switch.c   | 14 +++---
 12 files changed, 36 insertions(+), 108 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 5336179..3cf4f0a 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1213,8 +1213,8 @@ static int b53_arl_op(struct b53_device *dev, int op, int 
port,
return b53_arl_rw_op(dev, 0);
 }
 
-int b53_fdb_prepare(struct dsa_switch *ds, int port,
-   const unsigned char *addr, u16 vid)
+int b53_fdb_add(struct dsa_switch *ds, int port,
+   const unsigned char *addr, u16 vid)
 {
struct b53_device *priv = ds->priv;
 
@@ -1224,17 +1224,7 @@ int b53_fdb_prepare(struct dsa_switch *ds, int port,
if (is5325(priv) || is5365(priv))
return -EOPNOTSUPP;
 
-   return 0;
-}
-EXPORT_SYMBOL(b53_fdb_prepare);
-
-void b53_fdb_add(struct dsa_switch *ds, int port,
-const unsigned char *addr, u16 vid)
-{
-   struct b53_device *priv = ds->priv;
-
-   if (b53_arl_op(priv, 0, port, addr, vid, true))
-   pr_err("%s: failed to add MAC address\n", __func__);
+   return b53_arl_op(priv, 0, port, addr, vid, true);
 }
 EXPORT_SYMBOL(b53_fdb_add);
 
@@ -1563,7 +1553,6 @@ static const struct dsa_switch_ops b53_switch_ops = {
.port_vlan_add  = b53_vlan_add,
.port_vlan_del  = b53_vlan_del,
.port_vlan_dump = b53_vlan_dump,
-   .port_fdb_prepare   = b53_fdb_prepare,
.port_fdb_dump  = b53_fdb_dump,
.port_fdb_add   = b53_fdb_add,
.port_fdb_del   = b53_fdb_del,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index d417bca..f29c892 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -396,10 +396,8 @@ int b53_vlan_del(struct dsa_switch *ds, int port,
 int b53_vlan_dump(struct dsa_switch *ds, int port,
  struct switchdev_obj_port_vlan *vlan,
  switchdev_obj_dump_cb_t *cb);
-int b53_fdb_prepare(struct dsa_switch *ds, int port,
-   const unsigned char *addr, u16 vid);
-void b53_fdb_add(struct dsa_switch *ds, int port,
-const unsigned char *addr, u16 vid);
+int b53_fdb_add(struct dsa_switch *ds, int port,
+   const unsigned char *addr, u16 vid);
 int b53_fdb_del(struct dsa_switch *ds, int port,
const unsigned char *addr, u16 vid);
 int b53_fdb_dump(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 558667c..1907b27 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1022,7 +1022,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
.port_vlan_add  = b53_vlan_add,
.port_vlan_del  = b53_vlan_del,
.port_vlan_dump = b53_vlan_dump,
-   .port_fdb_prepare   = b53_fdb_prepare,
.port_fdb_dump  = b53_fdb_dump,
.port_fdb_add   = b53_fdb_add,
.port_fdb_del   = b53_fdb_del,
diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index db82808..b55f364 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -678,14 +678,6 @@ static int ksz_port_vlan_dump(struct dsa_switch *ds, int 
port,
return err;
 }
 
-static int ksz_port_fdb_prepare(struct dsa_switch *ds, int port,
-   const unsigned char *addr, u16 vid)
-{
-   /* nothing needed */
-
-   return 0;
-}
-
 struct alu_struct {
/* entry 1 */
u8  is_static:1;
@@ -705,12 +697,13 @@ struc

[PATCH net-next v3 07/13] net: dsa: Remove support for vlan dump from DSA's drivers

2017-08-06 Thread Arkadi Sharshevsky
This is done as a preparation before removing support for vlan dump from
DSA core. The vlans are synced with the bridge and thus there is no
need for special dump operation support.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 drivers/net/dsa/b53/b53_common.c   | 44 --
 drivers/net/dsa/b53/b53_priv.h |  3 --
 drivers/net/dsa/bcm_sf2.c  |  1 -
 drivers/net/dsa/dsa_loop.c | 38 ---
 drivers/net/dsa/microchip/ksz_common.c | 41 -
 drivers/net/dsa/mv88e6xxx/chip.c   | 56 --
 6 files changed, 183 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 3cf4f0a..0176d80 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1053,49 +1053,6 @@ int b53_vlan_del(struct dsa_switch *ds, int port,
 }
 EXPORT_SYMBOL(b53_vlan_del);
 
-int b53_vlan_dump(struct dsa_switch *ds, int port,
- struct switchdev_obj_port_vlan *vlan,
- switchdev_obj_dump_cb_t *cb)
-{
-   struct b53_device *dev = ds->priv;
-   u16 vid, vid_start = 0, pvid;
-   struct b53_vlan *vl;
-   int err = 0;
-
-   if (is5325(dev) || is5365(dev))
-   vid_start = 1;
-
-   b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), );
-
-   /* Use our software cache for dumps, since we do not have any HW
-* operation returning only the used/valid VLANs
-*/
-   for (vid = vid_start; vid < dev->num_vlans; vid++) {
-   vl = >vlans[vid];
-
-   if (!vl->valid)
-   continue;
-
-   if (!(vl->members & BIT(port)))
-   continue;
-
-   vlan->vid_begin = vlan->vid_end = vid;
-   vlan->flags = 0;
-
-   if (vl->untag & BIT(port))
-   vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
-   if (pvid == vid)
-   vlan->flags |= BRIDGE_VLAN_INFO_PVID;
-
-   err = cb(>obj);
-   if (err)
-   break;
-   }
-
-   return err;
-}
-EXPORT_SYMBOL(b53_vlan_dump);
-
 /* Address Resolution Logic routines */
 static int b53_arl_op_wait(struct b53_device *dev)
 {
@@ -1552,7 +1509,6 @@ static const struct dsa_switch_ops b53_switch_ops = {
.port_vlan_prepare  = b53_vlan_prepare,
.port_vlan_add  = b53_vlan_add,
.port_vlan_del  = b53_vlan_del,
-   .port_vlan_dump = b53_vlan_dump,
.port_fdb_dump  = b53_fdb_dump,
.port_fdb_add   = b53_fdb_add,
.port_fdb_del   = b53_fdb_del,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index f29c892..af5d6c1 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -393,9 +393,6 @@ void b53_vlan_add(struct dsa_switch *ds, int port,
  struct switchdev_trans *trans);
 int b53_vlan_del(struct dsa_switch *ds, int port,
 const struct switchdev_obj_port_vlan *vlan);
-int b53_vlan_dump(struct dsa_switch *ds, int port,
- struct switchdev_obj_port_vlan *vlan,
- switchdev_obj_dump_cb_t *cb);
 int b53_fdb_add(struct dsa_switch *ds, int port,
const unsigned char *addr, u16 vid);
 int b53_fdb_del(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 1907b27..bbcb405 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1021,7 +1021,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
.port_vlan_prepare  = b53_vlan_prepare,
.port_vlan_add  = b53_vlan_add,
.port_vlan_del  = b53_vlan_del,
-   .port_vlan_dump = b53_vlan_dump,
.port_fdb_dump  = b53_fdb_dump,
.port_fdb_add   = b53_fdb_add,
.port_fdb_del   = b53_fdb_del,
diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index fdd8f38..76d6660 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -257,43 +257,6 @@ static int dsa_loop_port_vlan_del(struct dsa_switch *ds, 
int port,
return 0;
 }
 
-static int dsa_loop_port_vlan_dump(struct dsa_switch *ds, int port,
-  struct switchdev_obj_port_vlan *vlan,
-  switchdev_obj_dump_cb_t *cb)
-{
-   struct dsa_loop_priv *ps = ds->priv;
-   struct mii_bus *bus = ps->bus;
-   struct dsa_loop_vlan *vl;
-   u16 vid, vid_start = 0;
-   int err = 0;
-
-   dev_dbg(ds->dev, "%s\n", __func__);
-
-   /* Just do a sleeping operation to make lockdep checks effective */
-   mdiobus_read(bus, ps->port_base + port, MII_BMSR);
-
-   for (vid 

[PATCH net-next v3 12/13] net: bridge: Remove FDB deletion through switchdev object

2017-08-06 Thread Arkadi Sharshevsky
At this point no driver supports FDB add/del through the switchdev object;
drivers use the notification chain instead. Thus, FDB deletion through the
switchdev object is removed.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
---
 net/bridge/br_fdb.c | 18 --
 1 file changed, 18 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a5e4a73..a79b648 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -169,29 +169,11 @@ static void fdb_del_hw_addr(struct net_bridge *br, const 
unsigned char *addr)
}
 }
 
-static void fdb_del_external_learn(struct net_bridge_fdb_entry *f)
-{
-   struct switchdev_obj_port_fdb fdb = {
-   .obj = {
-   .orig_dev = f->dst->dev,
-   .id = SWITCHDEV_OBJ_ID_PORT_FDB,
-   .flags = SWITCHDEV_F_DEFER,
-   },
-   .vid = f->vlan_id,
-   };
-
-   ether_addr_copy(fdb.addr, f->addr.addr);
-   switchdev_port_obj_del(f->dst->dev, );
-}
-
 static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
 {
if (f->is_static)
fdb_del_hw_addr(br, f->addr.addr);
 
-   if (f->added_by_external_learn)
-   fdb_del_external_learn(f);
-
hlist_del_init_rcu(>hlist);
fdb_notify(br, f, RTM_DELNEIGH);
call_rcu(>rcu, fdb_rcu_free);
-- 
2.4.11



[PATCH net-next v3 01/13] net: dsa: Change DSA slave FDB API to be switchdev independent

2017-08-06 Thread Arkadi Sharshevsky
In order to support FDB add/del on a notifier chain, the slave
API needs to be changed to be switchdev independent.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
Reviewed-by: Florian Fainelli <f.faine...@gmail.com>
---
 drivers/net/dsa/b53/b53_common.c   | 12 +---
 drivers/net/dsa/b53/b53_priv.h |  8 +++-
 drivers/net/dsa/microchip/ksz_common.c | 34 --
 drivers/net/dsa/mt7530.c   | 14 ++
 drivers/net/dsa/mv88e6xxx/chip.c   | 12 +---
 drivers/net/dsa/qca8k.c| 15 ++-
 include/net/dsa.h  |  8 +++-
 net/dsa/switch.c   |  8 +---
 8 files changed, 49 insertions(+), 62 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 7f36d3e..5336179 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1214,8 +1214,7 @@ static int b53_arl_op(struct b53_device *dev, int op, int 
port,
 }
 
 int b53_fdb_prepare(struct dsa_switch *ds, int port,
-   const struct switchdev_obj_port_fdb *fdb,
-   struct switchdev_trans *trans)
+   const unsigned char *addr, u16 vid)
 {
struct b53_device *priv = ds->priv;
 
@@ -1230,22 +1229,21 @@ int b53_fdb_prepare(struct dsa_switch *ds, int port,
 EXPORT_SYMBOL(b53_fdb_prepare);
 
 void b53_fdb_add(struct dsa_switch *ds, int port,
-const struct switchdev_obj_port_fdb *fdb,
-struct switchdev_trans *trans)
+const unsigned char *addr, u16 vid)
 {
struct b53_device *priv = ds->priv;
 
-   if (b53_arl_op(priv, 0, port, fdb->addr, fdb->vid, true))
+   if (b53_arl_op(priv, 0, port, addr, vid, true))
pr_err("%s: failed to add MAC address\n", __func__);
 }
 EXPORT_SYMBOL(b53_fdb_add);
 
 int b53_fdb_del(struct dsa_switch *ds, int port,
-   const struct switchdev_obj_port_fdb *fdb)
+   const unsigned char *addr, u16 vid)
 {
struct b53_device *priv = ds->priv;
 
-   return b53_arl_op(priv, 0, port, fdb->addr, fdb->vid, false);
+   return b53_arl_op(priv, 0, port, addr, vid, false);
 }
 EXPORT_SYMBOL(b53_fdb_del);
 
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 155a9c4..d417bca 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -397,13 +397,11 @@ int b53_vlan_dump(struct dsa_switch *ds, int port,
  struct switchdev_obj_port_vlan *vlan,
  switchdev_obj_dump_cb_t *cb);
 int b53_fdb_prepare(struct dsa_switch *ds, int port,
-   const struct switchdev_obj_port_fdb *fdb,
-   struct switchdev_trans *trans);
+   const unsigned char *addr, u16 vid);
 void b53_fdb_add(struct dsa_switch *ds, int port,
-const struct switchdev_obj_port_fdb *fdb,
-struct switchdev_trans *trans);
+const unsigned char *addr, u16 vid);
 int b53_fdb_del(struct dsa_switch *ds, int port,
-   const struct switchdev_obj_port_fdb *fdb);
+   const unsigned char *addr, u16 vid);
 int b53_fdb_dump(struct dsa_switch *ds, int port,
 struct switchdev_obj_port_fdb *fdb,
 switchdev_obj_dump_cb_t *cb);
diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index b313ecd..db82808 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -679,8 +679,7 @@ static int ksz_port_vlan_dump(struct dsa_switch *ds, int 
port,
 }
 
 static int ksz_port_fdb_prepare(struct dsa_switch *ds, int port,
-   const struct switchdev_obj_port_fdb *fdb,
-   struct switchdev_trans *trans)
+   const unsigned char *addr, u16 vid)
 {
/* nothing needed */
 
@@ -707,8 +706,7 @@ struct alu_struct {
 };
 
 static void ksz_port_fdb_add(struct dsa_switch *ds, int port,
-const struct switchdev_obj_port_fdb *fdb,
-struct switchdev_trans *trans)
+const unsigned char *addr, u16 vid)
 {
struct ksz_device *dev = ds->priv;
u32 alu_table[4];
@@ -717,12 +715,12 @@ static void ksz_port_fdb_add(struct dsa_switch *ds, int 
port,
mutex_lock(>alu_mutex);
 
/* find any entry with mac & vid */
-   data = fdb->vid << ALU_FID_INDEX_S;
-   data |= ((fdb->addr[0] << 8) | fdb->addr[1]);
+   data = vid << ALU_FID_INDEX_S;
+   data |= ((addr[0] << 8) | addr[1]);
ksz_write32(dev, REG_SW_ALU_INDEX_0, data);
 
-   data = ((fdb->addr[2] << 24) | (fdb-&

[PATCH net-next v3 05/13] net: dsa: Move FDB add/del implementation inside DSA

2017-08-06 Thread Arkadi Sharshevsky
Currently DSA uses switchdev's implementation of FDB add/del ndos. This
patch moves the implementation inside DSA in order to support the legacy
way for static FDB configuration.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 net/dsa/dsa_priv.h |  7 +++
 net/dsa/legacy.c   | 22 ++
 net/dsa/slave.c| 14 ++
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 5af25e7..dab10d5 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -112,6 +112,13 @@ bool dsa_schedule_work(struct work_struct *work);
 /* legacy.c */
 int dsa_legacy_register(void);
 void dsa_legacy_unregister(void);
+int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+  struct net_device *dev,
+  const unsigned char *addr, u16 vid,
+  u16 flags);
+int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+  struct net_device *dev,
+  const unsigned char *addr, u16 vid);
 
 /* port.c */
 int dsa_port_set_state(struct dsa_port *dp, u8 state,
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index 1d7a328..578001f 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -741,6 +741,28 @@ static int dsa_resume(struct device *d)
 }
 #endif
 
+/* legacy way, bypassing the bridge */
+int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+  struct net_device *dev,
+  const unsigned char *addr, u16 vid,
+  u16 flags)
+{
+   struct dsa_slave_priv *p = netdev_priv(dev);
+   struct dsa_port *dp = p->dp;
+
+   return dsa_port_fdb_add(dp, addr, vid);
+}
+
+int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+  struct net_device *dev,
+  const unsigned char *addr, u16 vid)
+{
+   struct dsa_slave_priv *p = netdev_priv(dev);
+   struct dsa_port *dp = p->dp;
+
+   return dsa_port_fdb_del(dp, addr, vid);
+}
+
 static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume);
 
 static const struct of_device_id dsa_of_match_table[] = {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a2fbbc7..2cf1c94 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -250,12 +250,6 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
 */
 
switch (obj->id) {
-   case SWITCHDEV_OBJ_ID_PORT_FDB:
-   if (switchdev_trans_ph_prepare(trans))
-   return 0;
-   err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj)->addr,
-  SWITCHDEV_OBJ_PORT_FDB(obj)->vid);
-   break;
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
break;
@@ -279,10 +273,6 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
int err;
 
switch (obj->id) {
-   case SWITCHDEV_OBJ_ID_PORT_FDB:
-   err = dsa_port_fdb_del(dp, SWITCHDEV_OBJ_PORT_FDB(obj)->addr,
-  SWITCHDEV_OBJ_PORT_FDB(obj)->vid);
-   break;
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
break;
@@ -955,8 +945,8 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_change_rx_flags= dsa_slave_change_rx_flags,
.ndo_set_rx_mode= dsa_slave_set_rx_mode,
.ndo_set_mac_address= dsa_slave_set_mac_address,
-   .ndo_fdb_add= switchdev_port_fdb_add,
-   .ndo_fdb_del= switchdev_port_fdb_del,
+   .ndo_fdb_add= dsa_legacy_fdb_add,
+   .ndo_fdb_del= dsa_legacy_fdb_del,
.ndo_fdb_dump   = switchdev_port_fdb_dump,
.ndo_do_ioctl   = dsa_slave_ioctl,
.ndo_get_iflink = dsa_slave_get_iflink,
-- 
2.4.11



[PATCH net-next v3 08/13] net: dsa: Remove support for bypass bridge port attributes/vlan set

2017-08-06 Thread Arkadi Sharshevsky
The bridge port attributes/vlan for DSA devices should be set only
from bridge code. Furthermore, the vlans are fully synced with the
bridge, so there is no need for special dump support.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 include/net/dsa.h  |  4 
 net/dsa/dsa_priv.h |  4 
 net/dsa/port.c | 12 
 net/dsa/slave.c|  6 --
 4 files changed, 26 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 446fc43..d7b9bdd 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -378,10 +378,6 @@ struct dsa_switch_ops {
 struct switchdev_trans *trans);
int (*port_vlan_del)(struct dsa_switch *ds, int port,
 const struct switchdev_obj_port_vlan *vlan);
-   int (*port_vlan_dump)(struct dsa_switch *ds, int port,
- struct switchdev_obj_port_vlan *vlan,
- switchdev_obj_dump_cb_t *cb);
-
/*
 * Forwarding database
 */
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index dab10d5..77ddec4 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -148,10 +148,6 @@ int dsa_port_vlan_add(struct dsa_port *dp,
  struct switchdev_trans *trans);
 int dsa_port_vlan_del(struct dsa_port *dp,
  const struct switchdev_obj_port_vlan *vlan);
-int dsa_port_vlan_dump(struct dsa_port *dp,
-  struct switchdev_obj_port_vlan *vlan,
-  switchdev_obj_dump_cb_t *cb);
-
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 86e0585..ce19216 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -246,15 +246,3 @@ int dsa_port_vlan_del(struct dsa_port *dp,
 
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, );
 }
-
-int dsa_port_vlan_dump(struct dsa_port *dp,
-  struct switchdev_obj_port_vlan *vlan,
-  switchdev_obj_dump_cb_t *cb)
-{
-   struct dsa_switch *ds = dp->ds;
-
-   if (ds->ops->port_vlan_dump)
-   return ds->ops->port_vlan_dump(ds, dp->index, vlan, cb);
-
-   return -EOPNOTSUPP;
-}
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 9205fda..5b37298 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -302,9 +302,6 @@ static int dsa_slave_port_obj_dump(struct net_device *dev,
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_dump(dp, SWITCHDEV_OBJ_PORT_MDB(obj), cb);
break;
-   case SWITCHDEV_OBJ_ID_PORT_VLAN:
-   err = dsa_port_vlan_dump(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), cb);
-   break;
default:
err = -EOPNOTSUPP;
break;
@@ -958,9 +955,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_netpoll_cleanup= dsa_slave_netpoll_cleanup,
.ndo_poll_controller= dsa_slave_poll_controller,
 #endif
-   .ndo_bridge_getlink = switchdev_port_bridge_getlink,
-   .ndo_bridge_setlink = switchdev_port_bridge_setlink,
-   .ndo_bridge_dellink = switchdev_port_bridge_dellink,
.ndo_get_phys_port_name = dsa_slave_get_phys_port_name,
.ndo_setup_tc   = dsa_slave_setup_tc,
.ndo_get_stats64= dsa_slave_get_stats64,
-- 
2.4.11



[PATCH net-next v3 03/13] net: dsa: Remove switchdev dependency from DSA switch notifier chain

2017-08-06 Thread Arkadi Sharshevsky
Currently, the switchdev objects are embedded inside the DSA notifier
info. This patch removes this dependency. This is done as a preparation
stage before adding support for learning FDB through the switchdev
notification chain.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Florian Fainelli <f.faine...@gmail.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
---
 net/dsa/dsa_priv.h | 11 ++-
 net/dsa/port.c | 15 +--
 net/dsa/slave.c|  6 --
 net/dsa/switch.c   | 11 ---
 4 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 330078d..f9f5de0 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -43,9 +43,10 @@ struct dsa_notifier_bridge_info {
 
 /* DSA_NOTIFIER_FDB_* */
 struct dsa_notifier_fdb_info {
-   const struct switchdev_obj_port_fdb *fdb;
int sw_index;
int port;
+   const unsigned char *addr;
+   u16 vid;
 };
 
 /* DSA_NOTIFIER_MDB_* */
@@ -121,10 +122,10 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool 
vlan_filtering,
struct switchdev_trans *trans);
 int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock,
 struct switchdev_trans *trans);
-int dsa_port_fdb_add(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb);
-int dsa_port_fdb_del(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb);
+int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+u16 vid);
+int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+u16 vid);
 int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb,
  switchdev_obj_dump_cb_t *cb);
 int dsa_port_mdb_add(struct dsa_port *dp,
diff --git a/net/dsa/port.c b/net/dsa/port.c
index bd271b9..86e0585 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -146,25 +146,28 @@ int dsa_port_ageing_time(struct dsa_port *dp, clock_t 
ageing_clock,
return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, );
 }
 
-int dsa_port_fdb_add(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb)
+int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+u16 vid)
 {
struct dsa_notifier_fdb_info info = {
.sw_index = dp->ds->index,
.port = dp->index,
-   .fdb = fdb,
+   .addr = addr,
+   .vid = vid,
};
 
return dsa_port_notify(dp, DSA_NOTIFIER_FDB_ADD, );
 }
 
-int dsa_port_fdb_del(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb)
+int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+u16 vid)
 {
struct dsa_notifier_fdb_info info = {
.sw_index = dp->ds->index,
.port = dp->index,
-   .fdb = fdb,
+   .addr = addr,
+   .vid = vid,
+
};
 
return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, );
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 4f06984..13c90e2 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -253,7 +253,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
case SWITCHDEV_OBJ_ID_PORT_FDB:
if (switchdev_trans_ph_prepare(trans))
return 0;
-   err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj));
+   err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj)->addr,
+  SWITCHDEV_OBJ_PORT_FDB(obj)->vid);
break;
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
@@ -279,7 +280,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
 
switch (obj->id) {
case SWITCHDEV_OBJ_ID_PORT_FDB:
-   err = dsa_port_fdb_del(dp, SWITCHDEV_OBJ_PORT_FDB(obj));
+   err = dsa_port_fdb_del(dp, SWITCHDEV_OBJ_PORT_FDB(obj)->addr,
+  SWITCHDEV_OBJ_PORT_FDB(obj)->vid);
break;
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index eb20e0f..e6c06aa 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -83,8 +83,6 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
 static int dsa_switch_fdb_add(struct dsa_switch *ds,
  struct dsa_notifier_fdb_info *info)
 {
-   const struct switchdev_obj_port_fdb *fdb = info->fdb;
-
/* Do not care yet about other switch chips of the fabric */
if (ds->index != info->sw_index)
return 0;
@@ -92,14 +90,13 @@ static int ds

[PATCH net-next v3 06/13] net: dsa: Add support for querying supported bridge flags

2017-08-06 Thread Arkadi Sharshevsky
The DSA drivers do not support bridge flags offload. Yet, this attribute
should be added in order for the bridge to fail when one tries to set a
flag on the port, as explained in commit dc0ecabd6231 ("net: switchdev:
Add support for querying supported bridge flags by hardware").

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
---
 net/dsa/slave.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 2cf1c94..9205fda 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -324,6 +324,9 @@ static int dsa_slave_port_attr_get(struct net_device *dev,
attr->u.ppid.id_len = sizeof(ds->index);
memcpy(>u.ppid.id, >index, attr->u.ppid.id_len);
break;
+   case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT:
+   attr->u.brport_flags_support = 0;
+   break;
default:
return -EOPNOTSUPP;
}
-- 
2.4.11



[PATCH net-next v3 00/13] Update DSA's FDB API and perform switchdev cleanup

2017-08-06 Thread Arkadi Sharshevsky
The patchset adds support for configuring static FDB entries via the
switchdev notification chain. The current method for FDB configuration
uses the switchdev's bridge bypass implementation. In order to support 
this legacy way and to perform the switchdev cleanup, the implementation
is moved inside DSA.

The DSA drivers cannot sync the software bridge with hardware learned
entries and use the switchdev's implementation of bypass FDB dumping.
Because they are the only ones using this functionality, the fdb_dump
implementation is moved from switchdev code into DSA.

Finally, after these changes, a major cleanup in switchdev can be done.
---
Please see individual patches for patch specific change logs.
v1->v2
- Split MDB/vlan dump removal into core/driver removal.

v2->v3
- The self implementation for FDB add/del is moved inside DSA.

Arkadi Sharshevsky (13):
  net: dsa: Change DSA slave FDB API to be switchdev independent
  net: dsa: Remove prepare phase for FDB
  net: dsa: Remove switchdev dependency from DSA switch notifier chain
  net: dsa: Add support for learning FDB through notification
  net: dsa: Move FDB add/del implementation inside DSA
  net: dsa: Add support for querying supported bridge flags
  net: dsa: Remove support for vlan dump from DSA's drivers
  net: dsa: Remove support for bypass bridge port attributes/vlan set
  net: dsa: Remove support for MDB dump from DSA's drivers
  net: dsa: Remove redundant MDB dump support
  net: dsa: Move FDB dump implementation inside DSA
  net: bridge: Remove FDB deletion through switchdev object
  net: switchdev: Remove bridge bypass support from switchdev

 drivers/net/dsa/b53/b53_common.c   |  83 +-
 drivers/net/dsa/b53/b53_priv.h |  16 +-
 drivers/net/dsa/bcm_sf2.c  |   2 -
 drivers/net/dsa/dsa_loop.c |  38 ---
 drivers/net/dsa/microchip/ksz_common.c | 124 ++--
 drivers/net/dsa/mt7530.c   |  41 +--
 drivers/net/dsa/mv88e6xxx/chip.c   | 147 ++
 drivers/net/dsa/qca8k.c|  42 +--
 include/net/dsa.h  |  23 +-
 include/net/switchdev.h|  87 --
 net/bridge/br_fdb.c|  18 --
 net/dsa/dsa.c  |  13 +
 net/dsa/dsa_priv.h |  29 +-
 net/dsa/legacy.c   |  22 ++
 net/dsa/port.c |  51 +---
 net/dsa/slave.c| 249 +---
 net/dsa/switch.c   |  21 +-
 net/switchdev/switchdev.c  | 519 -
 18 files changed, 360 insertions(+), 1165 deletions(-)

-- 
2.4.11



[PATCH net-next v3 11/13] net: dsa: Move FDB dump implementation inside DSA

2017-08-06 Thread Arkadi Sharshevsky
Of all switchdev devices, only DSA requires a special FDB dump. This is due
to the inability to sync the hardware-learned FDBs with the bridge.
Therefore, the FDB dump implementation is removed from switchdev and moved
inside DSA.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
v1->v2
- Cosmetics. ndm_state->is_static.
---
 drivers/net/dsa/b53/b53_common.c   |  16 ++
 drivers/net/dsa/b53/b53_priv.h |   3 +-
 drivers/net/dsa/microchip/ksz_common.c |  20 ++-
 drivers/net/dsa/mt7530.c   |  10 +---
 drivers/net/dsa/mv88e6xxx/chip.c   |  38 -
 drivers/net/dsa/qca8k.c|  15 ++---
 include/net/dsa.h  |   5 +-
 include/net/switchdev.h|  12 
 net/dsa/dsa_priv.h |   2 -
 net/dsa/port.c |  11 
 net/dsa/slave.c| 100 +
 net/switchdev/switchdev.c  |  84 ---
 12 files changed, 112 insertions(+), 204 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 0176d80..274f367 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1227,8 +1227,7 @@ static void b53_arl_search_rd(struct b53_device *dev, u8 
idx,
 }
 
 static int b53_fdb_copy(int port, const struct b53_arl_entry *ent,
-   struct switchdev_obj_port_fdb *fdb,
-   switchdev_obj_dump_cb_t *cb)
+   dsa_fdb_dump_cb_t *cb, void *data)
 {
if (!ent->is_valid)
return 0;
@@ -1236,16 +1235,11 @@ static int b53_fdb_copy(int port, const struct 
b53_arl_entry *ent,
if (port != ent->port)
return 0;
 
-   ether_addr_copy(fdb->addr, ent->mac);
-   fdb->vid = ent->vid;
-   fdb->ndm_state = ent->is_static ? NUD_NOARP : NUD_REACHABLE;
-
-   return cb(>obj);
+   return cb(ent->mac, ent->vid, ent->is_static, data);
 }
 
 int b53_fdb_dump(struct dsa_switch *ds, int port,
-struct switchdev_obj_port_fdb *fdb,
-switchdev_obj_dump_cb_t *cb)
+dsa_fdb_dump_cb_t *cb, void *data)
 {
struct b53_device *priv = ds->priv;
struct b53_arl_entry results[2];
@@ -1263,13 +1257,13 @@ int b53_fdb_dump(struct dsa_switch *ds, int port,
return ret;
 
b53_arl_search_rd(priv, 0, [0]);
-   ret = b53_fdb_copy(port, [0], fdb, cb);
+   ret = b53_fdb_copy(port, [0], cb, data);
if (ret)
return ret;
 
if (priv->num_arl_entries > 2) {
b53_arl_search_rd(priv, 1, [1]);
-   ret = b53_fdb_copy(port, [1], fdb, cb);
+   ret = b53_fdb_copy(port, [1], cb, data);
if (ret)
return ret;
 
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index af5d6c1..01bd8cb 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -398,8 +398,7 @@ int b53_fdb_add(struct dsa_switch *ds, int port,
 int b53_fdb_del(struct dsa_switch *ds, int port,
const unsigned char *addr, u16 vid);
 int b53_fdb_dump(struct dsa_switch *ds, int port,
-struct switchdev_obj_port_fdb *fdb,
-switchdev_obj_dump_cb_t *cb);
+dsa_fdb_dump_cb_t *cb, void *data);
 int b53_mirror_add(struct dsa_switch *ds, int port,
   struct dsa_mall_mirror_tc_entry *mirror, bool ingress);
 void b53_mirror_del(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index 4de9d90..56cd6d3 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -805,12 +805,11 @@ static void convert_alu(struct alu_struct *alu, u32 
*alu_table)
 }
 
 static int ksz_port_fdb_dump(struct dsa_switch *ds, int port,
-struct switchdev_obj_port_fdb *fdb,
-switchdev_obj_dump_cb_t *cb)
+dsa_fdb_dump_cb_t *cb, void *data)
 {
struct ksz_device *dev = ds->priv;
int ret = 0;
-   u32 data;
+   u32 ksz_data;
u32 alu_table[4];
struct alu_struct alu;
int timeout;
@@ -823,8 +822,8 @@ static int ksz_port_fdb_dump(struct dsa_switch *ds, int 
port,
do {
timeout = 1000;
do {
-   ksz_read32(dev, REG_SW_ALU_CTRL__4, );
-   if ((data & ALU_VALID) || !(data & ALU_START))
+   ksz_read32(dev, REG_SW_ALU_CTRL__4, _data);
+   if ((ksz_data & ALU_VALID) || !(ksz_data & ALU_START))
break;

[PATCH net-next v3 04/13] net: dsa: Add support for learning FDB through notification

2017-08-06 Thread Arkadi Sharshevsky
Add support for learning FDB through notification. The driver defers
the hardware update via an ordered work queue. In case of a successful
FDB add, a notification is sent back to the bridge.

In case of a hw FDB del failure, the static FDB will be deleted from
the bridge; thus, the interface is moved to the down state in order to
indicate the inconsistent situation.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
v1->v2
- Moved dsa_schedule_work declaration into net/dsa/dsa_priv.h.
- Fixed switchdev nb un-registration.
---
 net/dsa/dsa.c  |  13 ++
 net/dsa/dsa_priv.h |   1 +
 net/dsa/slave.c| 127 -
 3 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 0ba842c..73145ea 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -280,10 +280,22 @@ static struct packet_type dsa_pack_type __read_mostly = {
.func   = dsa_switch_rcv,
 };
 
+static struct workqueue_struct *dsa_owq;
+
+bool dsa_schedule_work(struct work_struct *work)
+{
+   return queue_work(dsa_owq, work);
+}
+
 static int __init dsa_init_module(void)
 {
int rc;
 
+   dsa_owq = alloc_ordered_workqueue("dsa_ordered",
+ WQ_MEM_RECLAIM);
+   if (!dsa_owq)
+   return -ENOMEM;
+
rc = dsa_slave_register_notifier();
if (rc)
return rc;
@@ -303,6 +315,7 @@ static void __exit dsa_cleanup_module(void)
dsa_slave_unregister_notifier();
dev_remove_pack(_pack_type);
dsa_legacy_unregister();
+   destroy_workqueue(dsa_owq);
 }
 module_exit(dsa_cleanup_module);
 
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index f9f5de0..5af25e7 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -107,6 +107,7 @@ void dsa_cpu_dsa_destroy(struct dsa_port *dport);
 const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp);
 void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp);
+bool dsa_schedule_work(struct work_struct *work);
 
 /* legacy.c */
 int dsa_legacy_register(void);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 13c90e2..a2fbbc7 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1295,19 +1295,142 @@ static int dsa_slave_netdevice_event(struct 
notifier_block *nb,
return NOTIFY_DONE;
 }
 
+struct dsa_switchdev_event_work {
+   struct work_struct work;
+   struct switchdev_notifier_fdb_info fdb_info;
+   struct net_device *dev;
+   unsigned long event;
+};
+
+static void dsa_slave_switchdev_event_work(struct work_struct *work)
+{
+   struct dsa_switchdev_event_work *switchdev_work =
+   container_of(work, struct dsa_switchdev_event_work, work);
+   struct net_device *dev = switchdev_work->dev;
+   struct switchdev_notifier_fdb_info *fdb_info;
+   struct dsa_slave_priv *p = netdev_priv(dev);
+   int err;
+
+   rtnl_lock();
+   switch (switchdev_work->event) {
+   case SWITCHDEV_FDB_ADD_TO_DEVICE:
+   fdb_info = _work->fdb_info;
+   err = dsa_port_fdb_add(p->dp, fdb_info->addr, fdb_info->vid);
+   if (err) {
+   netdev_dbg(dev, "fdb add failed err=%d\n", err);
+   break;
+   }
+   call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev,
+_info->info);
+   break;
+
+   case SWITCHDEV_FDB_DEL_TO_DEVICE:
+   fdb_info = _work->fdb_info;
+   err = dsa_port_fdb_del(p->dp, fdb_info->addr, fdb_info->vid);
+   if (err) {
+   netdev_dbg(dev, "fdb del failed err=%d\n", err);
+   dev_close(dev);
+   }
+   break;
+   }
+   rtnl_unlock();
+
+   kfree(switchdev_work->fdb_info.addr);
+   kfree(switchdev_work);
+   dev_put(dev);
+}
+
+static int
+dsa_slave_switchdev_fdb_work_init(struct dsa_switchdev_event_work *
+ switchdev_work,
+ const struct switchdev_notifier_fdb_info *
+ fdb_info)
+{
+   memcpy(_work->fdb_info, fdb_info,
+  sizeof(switchdev_work->fdb_info));
+   switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
+   if (!switchdev_work->fdb_info.addr)
+   return -ENOMEM;
+   ether_addr_copy((u8 *)switchdev_work->fdb_info.addr,
+   fdb_info->addr);
+   return 0;
+}
+
+/* Called under rcu_read_lock() */
+static int dsa_slave_switchdev_event(struct notifier_block *unused,
+unsigned long event, void *ptr)
+{
+   struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+   struct dsa_switchdev_event_work

[PATCH net-next v3 13/13] net: switchdev: Remove bridge bypass support from switchdev

2017-08-06 Thread Arkadi Sharshevsky
Currently the bridge port flags, vlans, FDBs and MDBs can be offloaded
through the bridge code, making the switchdev's SELF bridge bypass
implementation redundant. This implies several changes:
- No need for dump infra in switchdev, DSA's special case is handled
  privately.
- Remove obj_dump from switchdev_ops.
- FDBs are removed from obj_add/del routines, due to the fact that they
  are offloaded through the bridge notification chain.
- The switchdev_port_bridge_xx() and switchdev_port_fdb_xx() functions
  can be removed.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
---
v1->v2
- Fix typo in commit message.
---
 include/net/switchdev.h   |  75 
 net/switchdev/switchdev.c | 435 --
 2 files changed, 510 deletions(-)

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index d2637a6..d767b79 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -74,7 +74,6 @@ struct switchdev_attr {
 enum switchdev_obj_id {
SWITCHDEV_OBJ_ID_UNDEFINED,
SWITCHDEV_OBJ_ID_PORT_VLAN,
-   SWITCHDEV_OBJ_ID_PORT_FDB,
SWITCHDEV_OBJ_ID_PORT_MDB,
 };
 
@@ -97,17 +96,6 @@ struct switchdev_obj_port_vlan {
 #define SWITCHDEV_OBJ_PORT_VLAN(obj) \
container_of(obj, struct switchdev_obj_port_vlan, obj)
 
-/* SWITCHDEV_OBJ_ID_PORT_FDB */
-struct switchdev_obj_port_fdb {
-   struct switchdev_obj obj;
-   unsigned char addr[ETH_ALEN];
-   u16 vid;
-   u16 ndm_state;
-};
-
-#define SWITCHDEV_OBJ_PORT_FDB(obj) \
-   container_of(obj, struct switchdev_obj_port_fdb, obj)
-
 /* SWITCHDEV_OBJ_ID_PORT_MDB */
 struct switchdev_obj_port_mdb {
struct switchdev_obj obj;
@@ -135,8 +123,6 @@ typedef int switchdev_obj_dump_cb_t(struct switchdev_obj 
*obj);
  * @switchdev_port_obj_add: Add an object to port (see switchdev_obj_*).
  *
  * @switchdev_port_obj_del: Delete an object from port (see switchdev_obj_*).
- *
- * @switchdev_port_obj_dump: Dump port objects (see switchdev_obj_*).
  */
 struct switchdev_ops {
int (*switchdev_port_attr_get)(struct net_device *dev,
@@ -149,9 +135,6 @@ struct switchdev_ops {
  struct switchdev_trans *trans);
int (*switchdev_port_obj_del)(struct net_device *dev,
  const struct switchdev_obj *obj);
-   int (*switchdev_port_obj_dump)(struct net_device *dev,
-  struct switchdev_obj *obj,
-  switchdev_obj_dump_cb_t *cb);
 };
 
 enum switchdev_notifier_type {
@@ -189,25 +172,10 @@ int switchdev_port_obj_add(struct net_device *dev,
   const struct switchdev_obj *obj);
 int switchdev_port_obj_del(struct net_device *dev,
   const struct switchdev_obj *obj);
-int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj,
-   switchdev_obj_dump_cb_t *cb);
 int register_switchdev_notifier(struct notifier_block *nb);
 int unregister_switchdev_notifier(struct notifier_block *nb);
 int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
 struct switchdev_notifier_info *info);
-int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
- struct net_device *dev, u32 filter_mask,
- int nlflags);
-int switchdev_port_bridge_setlink(struct net_device *dev,
- struct nlmsghdr *nlh, u16 flags);
-int switchdev_port_bridge_dellink(struct net_device *dev,
- struct nlmsghdr *nlh, u16 flags);
-int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
-  struct net_device *dev, const unsigned char *addr,
-  u16 vid, u16 nlm_flags);
-int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
-  struct net_device *dev, const unsigned char *addr,
-  u16 vid);
 void switchdev_port_fwd_mark_set(struct net_device *dev,
 struct net_device *group_dev,
 bool joining);
@@ -246,13 +214,6 @@ static inline int switchdev_port_obj_del(struct net_device 
*dev,
return -EOPNOTSUPP;
 }
 
-static inline int switchdev_port_obj_dump(struct net_device *dev,
- const struct switchdev_obj *obj,
- switchdev_obj_dump_cb_t *cb)
-{
-   return -EOPNOTSUPP;
-}
-
 static inline int register_switchdev_notifier(struct notifier_block *nb)
 {
return 0;
@@ -270,42 +231,6 @@ static inline int call_switchdev_notifiers(unsigned long 
val,
return NOTIFY_DONE;
 }
 
-static inline int switchdev_port_bridge_getlink(stru

Re: [PATCH net-next v2 00/13] Change DSA's FDB API and perform switchdev cleanup

2017-08-06 Thread Arkadi Sharshevsky


On 08/04/2017 06:29 PM, Vivien Didelot wrote:
> Hi Arkadi, Jiri,
> 
> Jiri Pirko  writes:
> 
>>> It seems impossible currently to move the self to be the default, and
>>> this introduces regression which you don't approve, so it seems few
>>> options left:
>>>
>>> a) Leave two ways to add fdb, through the bridge (by using the master
>>>   flag) which is introduced in this patchset, and by using the self
>>>   which is the legacy way. In this way no regression will be introduced,
>>>   yet, it feels confusing a bit. The benefit is that we (DSA/mlxsw)
>>>   will be synced.
>>> b) Leave only the self (which means removing patch no 4,5).
>>
>> I belive that option a) is the correct way to go. Introduction of self
>> inclusion was a mistake from the very beginning. I think that we should
>> just move one and correct this mistake.
>>
>> Vivien, any arguments against a)?
> 
> I do agree with a). Arkadi, when moving switchdev implementations inside
> of DSA core, can I ask you to move the ones considered as the legacy way
> into legacy.c and ideally comment it? Configuration from userspace is
> still very confusing and this will remind us to get rid of it one day.
> 
> 
> Thanks,
> 
> Vivien
> 

Yeah, no problem.

Thanks,
Arkadi


Re: [PATCH net-next v2 00/13] Change DSA's FDB API and perform switchdev cleanup

2017-08-03 Thread Arkadi Sharshevsky

[...]

>> Now we have the "offload" read only flag, which is good to inform about
>> a successfully programmed hardware, but adds another level of complexity
>> to understand the interaction with the hardware.
>>
>> I think iproute2 is getting more and more confusing. From what I
>> understood, respecting the "self" flag as described is not possible
>> anymore due to some retro-compatibility reasons.
>>
>> Also Linux must use the hardware as an accelerator (so "self" or
>> "offload" must be the default), and always fall back to software
>> otherwise, hence "master" do not make sense here.
>>
>> What do you think about this synopsis for bridge fdb add?
>>
>> # bridge fdb add LLADDR dev DEV [ offload { on | off } ]
>>
>> Where offload defaults to "on". This option should also be ported to
>> other offloaded features like MDB and VLAN. Even though this is a bit
>> out of scope of this patchset, do you think this is feasible?
>>
> 
> I agree completely that currently it's confusing. The documentation
> should be updated for sure. I think that 'self' was primarily introduced
> (Commit 77162022a) for NIC embedded switches which are used for sriov, in
> that case the self is related to the internal eswitch, which completely
> diverges from the software one (clearly not switchdev).
> 
> IMHO For switchdev devices 'self' should not be an option at all, or any
> other arg regarding hardware. Furthermore, the 'offload' flag should be
> only relevant during the dump as an indication to the user.
> 
> Unfortunately, the lack of ability to sync the sw with hw in DSA's
> case introduces a problem for indicating that the entries are only
> in hw, I mean marking it only as offloaded is not enough.

Hi,

It seems currently impossible to move the self to be the default, and
this introduces a regression which you don't approve, so it seems there
are few options left:

a) Leave two ways to add fdb, through the bridge (by using the master
   flag) which is introduced in this patchset, and by using the self
   which is the legacy way. In this way no regression will be introduced,
   yet, it feels confusing a bit. The benefit is that we (DSA/mlxsw)
   will be synced.
b) Leave only the self (which means removing patch no 4,5).

In both cases the switchdev implementation of .ndo_fdb_add() will be
moved inside DSA in a similar way to the dump because it's only used by
you.

Option b) actually turns this patchset into a cosmetic one which only
does cleanup.

Thanks,
Arkadi






Re: [patch net-next 2/2] mlxsw: spectrum: Add support for access cable info via ethtool

2017-07-27 Thread Arkadi Sharshevsky


On 07/26/2017 05:18 PM, Andrew Lunn wrote:
>> +static int mlxsw_sp_get_module_info(struct net_device *netdev,
>> +struct ethtool_modinfo *modinfo)
>> +{
>> +struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(netdev);
>> +u8 module_info[MLXSW_SP_EEPROM_MODULE_INFO_SIZE];
>> +u8 module_rev_id, module_id;
>> +unsigned int read_size;
>> +int err;
>> +
>> +err = mlxsw_sp_query_module_eeprom(mlxsw_sp_port, 0,
>> +   MLXSW_SP_EEPROM_MODULE_INFO_SIZE,
>> +   module_info, _size);
>> +if (err)
>> +return err;
>> +
>> +if (read_size < MLXSW_SP_EEPROM_MODULE_INFO_SIZE)
>> +return -EIO;
>> +
>> +module_rev_id = module_info[MLXSW_SP_EEPROM_MODULE_INFO_REV_ID];
>> +module_id = module_info[MLXSW_SP_EEPROM_MODULE_INFO_ID];
>> +
>> +switch (module_id) {
>> +case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP:
>> +modinfo->type   = ETH_MODULE_SFF_8436;
>> +modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
>> +break;
>> +case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP_PLUS:
>> +case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28:
>> +if (module_id  == MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28 ||
>> +module_rev_id >= MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_8636) {
>> +modinfo->type   = ETH_MODULE_SFF_8636;
>> +modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN;
>> +} else {
>> +modinfo->type   = ETH_MODULE_SFF_8436;
>> +modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
>> +}
>> +break;
>> +case MLXSW_SP_EEPROM_MODULE_INFO_ID_SFP:
>> +modinfo->type   = ETH_MODULE_SFF_8472;
>> +modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
>> +break;
>> +default:
>> +return -EINVAL;
> 
> Hi Jiri
> 
> I remember seeing a few different implementations of this function in
> various drivers. Could you pull it out into a helper, passing in the
> array of bytes and mod info? bnxt, mlx4, mlx5, igb, ixgbe and sfc
> could then use the helper.
> 
>   Andrew
> 

Yeah, it seems generic for all modules, it can be moved into ethtool
I think.

Thanks,
Arkadi




Re: [PATCH iproute2] bridge: Assume master at FDB modification

2017-07-26 Thread Arkadi Sharshevsky


On 07/26/2017 07:08 PM, Stephen Hemminger wrote:
> On Wed, 26 Jul 2017 18:36:34 +0300
> Arkadi Sharshevsky <arka...@mellanox.com> wrote:
> 
>> According to the man page the master flag should be the default, yet, the
>> current code assumes otherwise.
>>
>> Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
> 
> Agree that the documentation and code don't match.
> But your change could break users with existing scripts by changing behavior.
> 
> It would be safer to change the man page not the code.
> 

Can we maybe set master and self by default? It doesn't make
sense to not include the bridge by default, and it will not cause
a regression in this case.


[PATCH iproute2] bridge: Assume master at FDB modification

2017-07-26 Thread Arkadi Sharshevsky
According to the man page the master flag should be the default, yet, the
current code assumes otherwise.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 bridge/fdb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index e5cebf9..7c77157 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -496,9 +496,9 @@ static int fdb_modify(int cmd, int flags, int argc, char 
**argv)
return -1;
}
 
-   /* Assume self */
+   /* Assume master */
if (!(req.ndm.ndm_flags&(NTF_SELF|NTF_MASTER)))
-   req.ndm.ndm_flags |= NTF_SELF;
+   req.ndm.ndm_flags |= NTF_MASTER;
 
/* Assume permanent */
if (!(req.ndm.ndm_state&(NUD_PERMANENT|NUD_REACHABLE)))
-- 
2.4.11



Re: [PATCH net-next v2 00/13] Change DSA's FDB API and perform switchdev cleanup

2017-07-23 Thread Arkadi Sharshevsky


On 07/20/2017 07:26 PM, Vivien Didelot wrote:
> Hi Arkadi,
> 
> Arkadi Sharshevsky <arka...@mellanox.com> writes:
> 
>> Hi, thanks for the test. If the fdb is marked as self its not in the
>> bridge at all. So before my patch it was OK because you supported the
>> self thing.
>>
>> Please notice that both fdbs you added are marked the same because the
>> default is self: vim bridge/fdb.c +499 (I think this is a bug because
>> the man page states that master is the default). So in order to put it
>> in the bridge you should specify "master":
>>
>> $bridge fdb add e4:1d:2d:a5:f0:4a dev sw1p7 master
>> $bridge fdb show brport sw1p7
>> e4:1d:2d:a5:f0:4a vlan 1 offload master br0 permanent <---also should
>> e4:1d:2d:46:13:f1 vlan 1 master br0 permanent be offloaded*
>> e4:1d:2d:46:13:f1 master br0 permanent
>> e4:1d:2d:a5:f0:4a offload master br0 permanent
>> 33:33:00:00:00:01 self permanent
>> 01:00:5e:00:00:01 self permanent
>> 33:33:ff:46:13:f1 self permanent
>>
>> *you should take the latest iproute.
> 
> Thanks for the explanation, it makes more sense now. Now with latest
> net-next iproute2 and your patch, I have this behavior, first before
> your patchset:
> 
> # bridge fdb add e4:1d:2d:a5:f0:2a dev lan3
> # bridge fdb add e4:1d:2d:a5:f0:4a dev lan4 master
> # bridge fdb show
> 01:00:5e:00:00:01 dev eth0 self permanent
> 01:00:5e:00:00:01 dev eth1 self permanent
> 2a:6e:b6:a8:25:f1 dev lan0 master br0 permanent
> e4:1d:2d:a5:f0:2a dev lan3 self static
> e4:1d:2d:a5:f0:4a dev lan4 master br0 permanent
> 01:00:5e:00:00:01 dev br0 self permanent
> # bridge fdb del e4:1d:2d:a5:f0:2a dev lan3
> # bridge fdb del e4:1d:2d:a5:f0:4a dev lan4 master
> # bridge fdb show
> 01:00:5e:00:00:01 dev eth0 self permanent
> 01:00:5e:00:00:01 dev eth1 self permanent
> 2a:6e:b6:a8:25:f1 dev lan0 master br0 permanent
> 01:00:5e:00:00:01 dev br0 self permanent
> 
> and after your patchset:
> 
> # bridge fdb add e4:1d:2d:a5:f0:2a dev lan3
> # bridge fdb add e4:1d:2d:a5:f0:4a dev lan4 master
> # bridge fdb show
> 01:00:5e:00:00:01 dev eth0 self permanent
> e4:1d:2d:a5:f0:2a dev eth1 self permanent
> 01:00:5e:00:00:01 dev eth1 self permanent
> da:ac:a3:36:f2:10 dev lan0 master br0 permanent
> e4:1d:2d:a5:f0:4a dev lan4 offload master br0 permanent
> e4:1d:2d:a5:f0:4a dev lan4 self static
> 01:00:5e:00:00:01 dev br0 self permanent
> # bridge fdb del e4:1d:2d:a5:f0:2a dev lan3
> # bridge fdb del e4:1d:2d:a5:f0:4a dev lan4 master
> # bridge fdb show
> 01:00:5e:00:00:01 dev eth0 self permanent
> 01:00:5e:00:00:01 dev eth1 self permanent
> da:ac:a3:36:f2:10 dev lan0 master br0 permanent
> 01:00:5e:00:00:01 dev br0 self permanent
> 
> For lan4, the behavior seems correct. Even if reporting "lan4 self
> static" seems odd and redundant with the above "lan4 offload master".

Yeah, but remember that because we didn't remove the fdb dump from DSA
the dump operation will dump the bridge fdb and then dump the DSA's
fdbs (via the .ndo_fdb_dump()). So we see it twice due to the hw
limitation for syncing the bridge.

> However, adding an address to lan3 without flag (hence self...) ends up
> in the switch master device (i.e. SoC side of the CPU port conduit.)
> 
> Do you have an idea why the patchset changes that?
> 

Yeah, I think that I figured it out: after my patchset there is no DSA
driver implementation for ndo_fdb_add(), thus, the default
implementation will be called:

vim net/core/rtnetlink.c +3113

The default ndo_fdb_add() implementation does two things
(Commit 090096bf3):

1. Adds the address to the unicast list of the device.
2. Calls __dev_set_rx_mode.

In dsa, the set_rx_mode implementation, dsa_slave_set_rx_mode(),
will sync the master device with the addresses.

After that an fdb dump will show this address also on the master
device.

I think this default implementation is relevant mostly for nics
and is not relevant for switchdev net devices.

We can discuss how to bypass this default implementation in our
case because at least in the case of mlxsw it does not make sense.
I can send it as a follow-up patch.

>> Also it seems strange that I removed the self support from the driver
>> but you still managed to configure it. The reason is the default
>> self implementation:
>>
>> vim net/core/rtnetlink.c +3112
>>
>> I think it is relevant for NICs mostly, so we can ignore it.
> 
> Regardless your patchset, this is unfortunately inconsistent with the
> bridge man page, describing:
> 
> self - the address is assoc

Re: [PATCH net-next v2 00/13] Change DSA's FDB API and perform switchdev cleanup

2017-07-20 Thread Arkadi Sharshevsky


On 07/19/2017 11:17 PM, Vivien Didelot wrote:
> Hi Arkadi,
> 
> I am testing your patch series the behavior changes suspiciously:
> 
> # brctl show br0
> bridge name   bridge id   STP enabled interfaces
> br0   8000.f6d5ef06ccdd   no  lan0
> lan1
> lan2
> lan3
> lan4
> lan5
> lan6
> lan7
> lan8
> optical3
> optical4
> 
> Without the patchset I have this behavior:
> 
> # bridge fdb add 00:11:22:33:44:55 dev lan4
> # bridge fdb add 22:33:44:55:66:77 dev lan2 self
> # bridge fdb show   
> 01:00:5e:00:00:01 dev eth0 self permanent
> 01:00:5e:00:00:01 dev eth1 self permanent
> 0a:3f:f6:06:a2:ee dev lan0 master br0 permanent
> 22:33:44:55:66:77 dev lan2 self static
> 00:11:22:33:44:55 dev lan4 self static
> 01:00:5e:00:00:01 dev br0 self permanent
> 
> And now with the patchset applied I have:
> 
> # bridge fdb add 00:11:22:33:44:55 dev lan4
> # bridge fdb add 22:33:44:55:66:77 dev lan2 self
> # bridge fdb show
> 01:00:5e:00:00:01 dev eth0 self permanent
> 00:11:22:33:44:55 dev eth1 self permanent
> 22:33:44:55:66:77 dev eth1 self permanent
> 01:00:5e:00:00:01 dev eth1 self permanent
> 0a:ca:c8:6b:05:65 dev lan0 master br0 permanent
> 01:00:5e:00:00:01 dev br0 self permanent
> 
> 
> It looks like the FDB entries are reported to be associated with the
> master net device (eth1). Is the dump broken or is it the whole add?
> 
> Thanks,
> 
> Vivien
> 

Hi, thanks for the test. If the fdb is marked as self it's not in the
bridge at all. So before my patch it was OK because you supported the
self thing.

Please notice that both fdbs you added are marked the same because the
default is self: vim bridge/fdb.c +499 (I think this is a bug because
the man page states that master is the default). So in order to put it
in the bridge you should specify "master":

$bridge fdb add e4:1d:2d:a5:f0:4a dev sw1p7 master
$bridge fdb show brport sw1p7
e4:1d:2d:a5:f0:4a vlan 1 offload master br0 permanent <---also should
e4:1d:2d:46:13:f1 vlan 1 master br0 permanent be offloaded*
e4:1d:2d:46:13:f1 master br0 permanent
e4:1d:2d:a5:f0:4a offload master br0 permanent
33:33:00:00:00:01 self permanent
01:00:5e:00:00:01 self permanent
33:33:ff:46:13:f1 self permanent

*you should take the latest iproute.

Also it seems strange that I removed the self support from the driver
but you still managed to configure it. The reason is the default
self implementation:

vim net/core/rtnetlink.c +3112

I think it is relevant for NICs mostly, so we can ignore it.







[PATCH net-next v2 00/13] Change DSA's FDB API and perform switchdev cleanup

2017-07-19 Thread Arkadi Sharshevsky
The patchset moves the DSA driver into learning static FDB entries via
the switchdev notification chain rather than by using the bridge bypass
SELF flag.

The DSA drivers cannot sync the software bridge with hardware learned
entries and use the switchdev's implementation of bypass FDB dumping.
Because they are the only ones using this functionality, the fdb_dump
implementation is moved from switchdev code into DSA.

Finally, after these changes a major cleanup in switchdev can be done.
---
Please see individual patches for patch specific change logs.
v1->v2
- Split MDB/vlan dump removal into core/driver removal.

Arkadi Sharshevsky (13):
  net: dsa: Change DSA slave FDB API to be switchdev independent
  net: dsa: Remove prepare phase for FDB
  net: dsa: Remove switchdev dependency from DSA switch notifier chain
  net: dsa: Add support for learning FDB through notification
  net: dsa: Remove support for FDB add/del via SELF
  net: dsa: Add support for querying supported bridge flags
  net: dsa: Remove support for vlan dump from DSA's drivers
  net: dsa: Remove support for bypass bridge port attributes/vlan set
  net: dsa: Remove support for MDB dump from DSA's drivers
  net: dsa: Remove redundant MDB dump support
  net: dsa: Move FDB dump implementation inside DSA
  net: bridge: Remove FDB deletion through switchdev object
  net: switchdev: Remove bridge bypass support from switchdev

 drivers/net/dsa/b53/b53_common.c   |  83 +-
 drivers/net/dsa/b53/b53_priv.h |  16 +-
 drivers/net/dsa/bcm_sf2.c  |   2 -
 drivers/net/dsa/dsa_loop.c |  38 ---
 drivers/net/dsa/microchip/ksz_common.c | 124 ++--
 drivers/net/dsa/mt7530.c   |  41 +--
 drivers/net/dsa/mv88e6xxx/chip.c   | 147 ++
 drivers/net/dsa/qca8k.c|  42 +--
 include/net/dsa.h  |  23 +-
 include/net/switchdev.h|  87 --
 net/bridge/br_fdb.c|  18 --
 net/dsa/dsa.c  |  13 +
 net/dsa/dsa_priv.h |  22 +-
 net/dsa/port.c |  51 +---
 net/dsa/slave.c| 247 +---
 net/dsa/switch.c   |  21 +-
 net/switchdev/switchdev.c  | 519 -
 17 files changed, 329 insertions(+), 1165 deletions(-)

-- 
2.4.11



[PATCH net-next v2 08/13] net: dsa: Remove support for bypass bridge port attributes/vlan set

2017-07-19 Thread Arkadi Sharshevsky
The bridge port attributes/vlan for DSA devices should be set only
from bridge code. Furthermore, the vlans are totally synced with the
bridge so there is no need for special dump support.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 include/net/dsa.h  |  4 
 net/dsa/dsa_priv.h |  4 
 net/dsa/port.c | 12 
 net/dsa/slave.c|  6 --
 4 files changed, 26 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index f054d41..4b82647 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -384,10 +384,6 @@ struct dsa_switch_ops {
 struct switchdev_trans *trans);
int (*port_vlan_del)(struct dsa_switch *ds, int port,
 const struct switchdev_obj_port_vlan *vlan);
-   int (*port_vlan_dump)(struct dsa_switch *ds, int port,
- struct switchdev_obj_port_vlan *vlan,
- switchdev_obj_dump_cb_t *cb);
-
/*
 * Forwarding database
 */
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 3ad666a..cddcea2 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -139,10 +139,6 @@ int dsa_port_vlan_add(struct dsa_port *dp,
  struct switchdev_trans *trans);
 int dsa_port_vlan_del(struct dsa_port *dp,
  const struct switchdev_obj_port_vlan *vlan);
-int dsa_port_vlan_dump(struct dsa_port *dp,
-  struct switchdev_obj_port_vlan *vlan,
-  switchdev_obj_dump_cb_t *cb);
-
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 86e0585..ce19216 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -246,15 +246,3 @@ int dsa_port_vlan_del(struct dsa_port *dp,
 
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, );
 }
-
-int dsa_port_vlan_dump(struct dsa_port *dp,
-  struct switchdev_obj_port_vlan *vlan,
-  switchdev_obj_dump_cb_t *cb)
-{
-   struct dsa_switch *ds = dp->ds;
-
-   if (ds->ops->port_vlan_dump)
-   return ds->ops->port_vlan_dump(ds, dp->index, vlan, cb);
-
-   return -EOPNOTSUPP;
-}
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 3ad1f4d..f939d79 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -302,9 +302,6 @@ static int dsa_slave_port_obj_dump(struct net_device *dev,
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_dump(dp, SWITCHDEV_OBJ_PORT_MDB(obj), cb);
break;
-   case SWITCHDEV_OBJ_ID_PORT_VLAN:
-   err = dsa_port_vlan_dump(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), cb);
-   break;
default:
err = -EOPNOTSUPP;
break;
@@ -926,9 +923,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_netpoll_cleanup= dsa_slave_netpoll_cleanup,
.ndo_poll_controller= dsa_slave_poll_controller,
 #endif
-   .ndo_bridge_getlink = switchdev_port_bridge_getlink,
-   .ndo_bridge_setlink = switchdev_port_bridge_setlink,
-   .ndo_bridge_dellink = switchdev_port_bridge_dellink,
.ndo_get_phys_port_name = dsa_slave_get_phys_port_name,
.ndo_setup_tc   = dsa_slave_setup_tc,
 };
-- 
2.4.11



[PATCH net-next v2 02/13] net: dsa: Remove prepare phase for FDB

2017-07-19 Thread Arkadi Sharshevsky
The prepare phase for FDB add is unneeded because most of DSA devices
can have failures during bus transactions (SPI, I2C, etc.), thus, the
prepare phase cannot guarantee success of the commit stage.

The support for learning FDB through notification chain, which will be
introduced in the following patches, will provide the ability to notify
back the bridge about successful offload.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
Reviewed-by: Florian Fainelli <f.faine...@gmail.com>
---
 drivers/net/dsa/b53/b53_common.c   | 17 +++--
 drivers/net/dsa/b53/b53_priv.h |  6 ++
 drivers/net/dsa/bcm_sf2.c  |  1 -
 drivers/net/dsa/microchip/ksz_common.c | 24 ++--
 drivers/net/dsa/mt7530.c   | 25 -
 drivers/net/dsa/mv88e6xxx/chip.c   | 23 +++
 drivers/net/dsa/qca8k.c| 18 +-
 include/net/dsa.h  |  4 +---
 net/dsa/dsa_priv.h |  4 +---
 net/dsa/port.c |  4 +---
 net/dsa/slave.c|  4 +++-
 net/dsa/switch.c   | 14 +++---
 12 files changed, 36 insertions(+), 108 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index d0156dc..c414b43 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1213,8 +1213,8 @@ static int b53_arl_op(struct b53_device *dev, int op, int 
port,
return b53_arl_rw_op(dev, 0);
 }
 
-int b53_fdb_prepare(struct dsa_switch *ds, int port,
-   const unsigned char *addr, u16 vid)
+int b53_fdb_add(struct dsa_switch *ds, int port,
+   const unsigned char *addr, u16 vid)
 {
struct b53_device *priv = ds->priv;
 
@@ -1224,17 +1224,7 @@ int b53_fdb_prepare(struct dsa_switch *ds, int port,
if (is5325(priv) || is5365(priv))
return -EOPNOTSUPP;
 
-   return 0;
-}
-EXPORT_SYMBOL(b53_fdb_prepare);
-
-void b53_fdb_add(struct dsa_switch *ds, int port,
-const unsigned char *addr, u16 vid)
-{
-   struct b53_device *priv = ds->priv;
-
-   if (b53_arl_op(priv, 0, port, addr, vid, true))
-   pr_err("%s: failed to add MAC address\n", __func__);
+   return b53_arl_op(priv, 0, port, addr, vid, true);
 }
 EXPORT_SYMBOL(b53_fdb_add);
 
@@ -1563,7 +1553,6 @@ static const struct dsa_switch_ops b53_switch_ops = {
.port_vlan_add  = b53_vlan_add,
.port_vlan_del  = b53_vlan_del,
.port_vlan_dump = b53_vlan_dump,
-   .port_fdb_prepare   = b53_fdb_prepare,
.port_fdb_dump  = b53_fdb_dump,
.port_fdb_add   = b53_fdb_add,
.port_fdb_del   = b53_fdb_del,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index d417bca..f29c892 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -396,10 +396,8 @@ int b53_vlan_del(struct dsa_switch *ds, int port,
 int b53_vlan_dump(struct dsa_switch *ds, int port,
  struct switchdev_obj_port_vlan *vlan,
  switchdev_obj_dump_cb_t *cb);
-int b53_fdb_prepare(struct dsa_switch *ds, int port,
-   const unsigned char *addr, u16 vid);
-void b53_fdb_add(struct dsa_switch *ds, int port,
-const unsigned char *addr, u16 vid);
+int b53_fdb_add(struct dsa_switch *ds, int port,
+   const unsigned char *addr, u16 vid);
 int b53_fdb_del(struct dsa_switch *ds, int port,
const unsigned char *addr, u16 vid);
 int b53_fdb_dump(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 648f91b..a26e99d 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1034,7 +1034,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
.port_vlan_add  = b53_vlan_add,
.port_vlan_del  = b53_vlan_del,
.port_vlan_dump = b53_vlan_dump,
-   .port_fdb_prepare   = b53_fdb_prepare,
.port_fdb_dump  = b53_fdb_dump,
.port_fdb_add   = b53_fdb_add,
.port_fdb_del   = b53_fdb_del,
diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index db82808..b55f364 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -678,14 +678,6 @@ static int ksz_port_vlan_dump(struct dsa_switch *ds, int 
port,
return err;
 }
 
-static int ksz_port_fdb_prepare(struct dsa_switch *ds, int port,
-   const unsigned char *addr, u16 vid)
-{
-   /* nothing needed */
-
-   return 0;
-}
-
 struct alu_struct {
/* entry 1 */
u8  is_static:1;
@@ -705,12 +697,13 @@ struc

[PATCH net-next v2 07/13] net: dsa: Remove support for vlan dump from DSA's drivers

2017-07-19 Thread Arkadi Sharshevsky
This is done as a preparation before removing support for vlan dump from
DSA core. The vlans are synced with the bridge and thus there is no
need for special dump operation support.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 drivers/net/dsa/b53/b53_common.c   | 44 --
 drivers/net/dsa/b53/b53_priv.h |  3 --
 drivers/net/dsa/bcm_sf2.c  |  1 -
 drivers/net/dsa/dsa_loop.c | 38 ---
 drivers/net/dsa/microchip/ksz_common.c | 41 -
 drivers/net/dsa/mv88e6xxx/chip.c   | 56 --
 6 files changed, 183 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index c414b43..6020e88 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1053,49 +1053,6 @@ int b53_vlan_del(struct dsa_switch *ds, int port,
 }
 EXPORT_SYMBOL(b53_vlan_del);
 
-int b53_vlan_dump(struct dsa_switch *ds, int port,
- struct switchdev_obj_port_vlan *vlan,
- switchdev_obj_dump_cb_t *cb)
-{
-   struct b53_device *dev = ds->priv;
-   u16 vid, vid_start = 0, pvid;
-   struct b53_vlan *vl;
-   int err = 0;
-
-   if (is5325(dev) || is5365(dev))
-   vid_start = 1;
-
-   b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &pvid);
-
-   /* Use our software cache for dumps, since we do not have any HW
-* operation returning only the used/valid VLANs
-*/
-   for (vid = vid_start; vid < dev->num_vlans; vid++) {
-   vl = &dev->vlans[vid];
-
-   if (!vl->valid)
-   continue;
-
-   if (!(vl->members & BIT(port)))
-   continue;
-
-   vlan->vid_begin = vlan->vid_end = vid;
-   vlan->flags = 0;
-
-   if (vl->untag & BIT(port))
-   vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
-   if (pvid == vid)
-   vlan->flags |= BRIDGE_VLAN_INFO_PVID;
-
-   err = cb(&vlan->obj);
-   if (err)
-   break;
-   }
-
-   return err;
-}
-EXPORT_SYMBOL(b53_vlan_dump);
-
 /* Address Resolution Logic routines */
 static int b53_arl_op_wait(struct b53_device *dev)
 {
@@ -1552,7 +1509,6 @@ static const struct dsa_switch_ops b53_switch_ops = {
.port_vlan_prepare  = b53_vlan_prepare,
.port_vlan_add  = b53_vlan_add,
.port_vlan_del  = b53_vlan_del,
-   .port_vlan_dump = b53_vlan_dump,
.port_fdb_dump  = b53_fdb_dump,
.port_fdb_add   = b53_fdb_add,
.port_fdb_del   = b53_fdb_del,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index f29c892..af5d6c1 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -393,9 +393,6 @@ void b53_vlan_add(struct dsa_switch *ds, int port,
  struct switchdev_trans *trans);
 int b53_vlan_del(struct dsa_switch *ds, int port,
 const struct switchdev_obj_port_vlan *vlan);
-int b53_vlan_dump(struct dsa_switch *ds, int port,
- struct switchdev_obj_port_vlan *vlan,
- switchdev_obj_dump_cb_t *cb);
 int b53_fdb_add(struct dsa_switch *ds, int port,
const unsigned char *addr, u16 vid);
 int b53_fdb_del(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index a26e99d..824a137 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1033,7 +1033,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
.port_vlan_prepare  = b53_vlan_prepare,
.port_vlan_add  = b53_vlan_add,
.port_vlan_del  = b53_vlan_del,
-   .port_vlan_dump = b53_vlan_dump,
.port_fdb_dump  = b53_fdb_dump,
.port_fdb_add   = b53_fdb_add,
.port_fdb_del   = b53_fdb_del,
diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index fdd8f38..76d6660 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -257,43 +257,6 @@ static int dsa_loop_port_vlan_del(struct dsa_switch *ds, 
int port,
return 0;
 }
 
-static int dsa_loop_port_vlan_dump(struct dsa_switch *ds, int port,
-  struct switchdev_obj_port_vlan *vlan,
-  switchdev_obj_dump_cb_t *cb)
-{
-   struct dsa_loop_priv *ps = ds->priv;
-   struct mii_bus *bus = ps->bus;
-   struct dsa_loop_vlan *vl;
-   u16 vid, vid_start = 0;
-   int err = 0;
-
-   dev_dbg(ds->dev, "%s\n", __func__);
-
-   /* Just do a sleeping operation to make lockdep checks effective */
-   mdiobus_read(bus, ps->port_base + port, MII_BMSR);
-
-   for (vid 

[PATCH net-next v2 04/13] net: dsa: Add support for learning FDB through notification

2017-07-19 Thread Arkadi Sharshevsky
Add support for learning FDB through notification. The driver defers
the hardware update via ordered work queue. In case of a successful
FDB add a notification is sent back to bridge.

If a hardware FDB delete fails, the static FDB entry is still deleted
from the bridge; the interface is therefore moved to the down state in
order to indicate the inconsistent situation.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
v1->v2
- Moved dsa_schedule_work declaration into net/dsa/dsa_priv.h.
- Fixed switchdev nb un-registration.
---
 net/dsa/dsa.c  |  13 ++
 net/dsa/dsa_priv.h |   1 +
 net/dsa/slave.c| 127 -
 3 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 416ac4e..9abe6dc 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -271,10 +271,22 @@ static struct packet_type dsa_pack_type __read_mostly = {
.func   = dsa_switch_rcv,
 };
 
+static struct workqueue_struct *dsa_owq;
+
+bool dsa_schedule_work(struct work_struct *work)
+{
+   return queue_work(dsa_owq, work);
+}
+
 static int __init dsa_init_module(void)
 {
int rc;
 
+   dsa_owq = alloc_ordered_workqueue("dsa_ordered",
+ WQ_MEM_RECLAIM);
+   if (!dsa_owq)
+   return -ENOMEM;
+
rc = dsa_slave_register_notifier();
if (rc)
return rc;
@@ -294,6 +306,7 @@ static void __exit dsa_cleanup_module(void)
dsa_slave_unregister_notifier();
dev_remove_pack(&dsa_pack_type);
dsa_legacy_unregister();
+   destroy_workqueue(dsa_owq);
 }
 module_exit(dsa_cleanup_module);
 
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 2b2f124..3ad666a 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -105,6 +105,7 @@ void dsa_cpu_dsa_destroy(struct dsa_port *dport);
 const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp);
 void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp);
+bool dsa_schedule_work(struct work_struct *work);
 
 /* legacy.c */
 int dsa_legacy_register(void);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 19395cc..f595133 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1263,19 +1263,142 @@ static int dsa_slave_netdevice_event(struct 
notifier_block *nb,
return NOTIFY_DONE;
 }
 
+struct dsa_switchdev_event_work {
+   struct work_struct work;
+   struct switchdev_notifier_fdb_info fdb_info;
+   struct net_device *dev;
+   unsigned long event;
+};
+
+static void dsa_slave_switchdev_event_work(struct work_struct *work)
+{
+   struct dsa_switchdev_event_work *switchdev_work =
+   container_of(work, struct dsa_switchdev_event_work, work);
+   struct net_device *dev = switchdev_work->dev;
+   struct switchdev_notifier_fdb_info *fdb_info;
+   struct dsa_slave_priv *p = netdev_priv(dev);
+   int err;
+
+   rtnl_lock();
+   switch (switchdev_work->event) {
+   case SWITCHDEV_FDB_ADD_TO_DEVICE:
+   fdb_info = &switchdev_work->fdb_info;
+   err = dsa_port_fdb_add(p->dp, fdb_info->addr, fdb_info->vid);
+   if (err) {
+   netdev_dbg(dev, "fdb add failed err=%d\n", err);
+   break;
+   }
+   call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev,
+&fdb_info->info);
+   break;
+
+   case SWITCHDEV_FDB_DEL_TO_DEVICE:
+   fdb_info = &switchdev_work->fdb_info;
+   err = dsa_port_fdb_del(p->dp, fdb_info->addr, fdb_info->vid);
+   if (err) {
+   netdev_dbg(dev, "fdb del failed err=%d\n", err);
+   dev_close(dev);
+   }
+   break;
+   }
+   rtnl_unlock();
+
+   kfree(switchdev_work->fdb_info.addr);
+   kfree(switchdev_work);
+   dev_put(dev);
+}
+
+static int
+dsa_slave_switchdev_fdb_work_init(struct dsa_switchdev_event_work *
+ switchdev_work,
+ const struct switchdev_notifier_fdb_info *
+ fdb_info)
+{
+   memcpy(&switchdev_work->fdb_info, fdb_info,
+  sizeof(switchdev_work->fdb_info));
+   switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
+   if (!switchdev_work->fdb_info.addr)
+   return -ENOMEM;
+   ether_addr_copy((u8 *)switchdev_work->fdb_info.addr,
+   fdb_info->addr);
+   return 0;
+}
+
+/* Called under rcu_read_lock() */
+static int dsa_slave_switchdev_event(struct notifier_block *unused,
+unsigned long event, void *ptr)
+{
+   struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+   struct dsa_switchdev_event_work

[PATCH net-next v2 01/13] net: dsa: Change DSA slave FDB API to be switchdev independent

2017-07-19 Thread Arkadi Sharshevsky
In order to support FDB add/del over a notifier chain, the slave
API needs to be changed to be switchdev independent.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
Reviewed-by: Florian Fainelli <f.faine...@gmail.com>
---
 drivers/net/dsa/b53/b53_common.c   | 12 +---
 drivers/net/dsa/b53/b53_priv.h |  8 +++-
 drivers/net/dsa/microchip/ksz_common.c | 34 --
 drivers/net/dsa/mt7530.c   | 14 ++
 drivers/net/dsa/mv88e6xxx/chip.c   | 12 +---
 drivers/net/dsa/qca8k.c| 15 ++-
 include/net/dsa.h  |  8 +++-
 net/dsa/switch.c   |  8 +---
 8 files changed, 49 insertions(+), 62 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index e68d368..d0156dc 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1214,8 +1214,7 @@ static int b53_arl_op(struct b53_device *dev, int op, int 
port,
 }
 
 int b53_fdb_prepare(struct dsa_switch *ds, int port,
-   const struct switchdev_obj_port_fdb *fdb,
-   struct switchdev_trans *trans)
+   const unsigned char *addr, u16 vid)
 {
struct b53_device *priv = ds->priv;
 
@@ -1230,22 +1229,21 @@ int b53_fdb_prepare(struct dsa_switch *ds, int port,
 EXPORT_SYMBOL(b53_fdb_prepare);
 
 void b53_fdb_add(struct dsa_switch *ds, int port,
-const struct switchdev_obj_port_fdb *fdb,
-struct switchdev_trans *trans)
+const unsigned char *addr, u16 vid)
 {
struct b53_device *priv = ds->priv;
 
-   if (b53_arl_op(priv, 0, port, fdb->addr, fdb->vid, true))
+   if (b53_arl_op(priv, 0, port, addr, vid, true))
pr_err("%s: failed to add MAC address\n", __func__);
 }
 EXPORT_SYMBOL(b53_fdb_add);
 
 int b53_fdb_del(struct dsa_switch *ds, int port,
-   const struct switchdev_obj_port_fdb *fdb)
+   const unsigned char *addr, u16 vid)
 {
struct b53_device *priv = ds->priv;
 
-   return b53_arl_op(priv, 0, port, fdb->addr, fdb->vid, false);
+   return b53_arl_op(priv, 0, port, addr, vid, false);
 }
 EXPORT_SYMBOL(b53_fdb_del);
 
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 155a9c4..d417bca 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -397,13 +397,11 @@ int b53_vlan_dump(struct dsa_switch *ds, int port,
  struct switchdev_obj_port_vlan *vlan,
  switchdev_obj_dump_cb_t *cb);
 int b53_fdb_prepare(struct dsa_switch *ds, int port,
-   const struct switchdev_obj_port_fdb *fdb,
-   struct switchdev_trans *trans);
+   const unsigned char *addr, u16 vid);
 void b53_fdb_add(struct dsa_switch *ds, int port,
-const struct switchdev_obj_port_fdb *fdb,
-struct switchdev_trans *trans);
+const unsigned char *addr, u16 vid);
 int b53_fdb_del(struct dsa_switch *ds, int port,
-   const struct switchdev_obj_port_fdb *fdb);
+   const unsigned char *addr, u16 vid);
 int b53_fdb_dump(struct dsa_switch *ds, int port,
 struct switchdev_obj_port_fdb *fdb,
 switchdev_obj_dump_cb_t *cb);
diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index b313ecd..db82808 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -679,8 +679,7 @@ static int ksz_port_vlan_dump(struct dsa_switch *ds, int 
port,
 }
 
 static int ksz_port_fdb_prepare(struct dsa_switch *ds, int port,
-   const struct switchdev_obj_port_fdb *fdb,
-   struct switchdev_trans *trans)
+   const unsigned char *addr, u16 vid)
 {
/* nothing needed */
 
@@ -707,8 +706,7 @@ struct alu_struct {
 };
 
 static void ksz_port_fdb_add(struct dsa_switch *ds, int port,
-const struct switchdev_obj_port_fdb *fdb,
-struct switchdev_trans *trans)
+const unsigned char *addr, u16 vid)
 {
struct ksz_device *dev = ds->priv;
u32 alu_table[4];
@@ -717,12 +715,12 @@ static void ksz_port_fdb_add(struct dsa_switch *ds, int 
port,
mutex_lock(&dev->alu_mutex);
 
/* find any entry with mac & vid */
-   data = fdb->vid << ALU_FID_INDEX_S;
-   data |= ((fdb->addr[0] << 8) | fdb->addr[1]);
+   data = vid << ALU_FID_INDEX_S;
+   data |= ((addr[0] << 8) | addr[1]);
ksz_write32(dev, REG_SW_ALU_INDEX_0, data);
 
-   data = ((fdb->addr[2] << 24) | (fdb-&

[PATCH net-next v2 05/13] net: dsa: Remove support for FDB add/del via SELF

2017-07-19 Thread Arkadi Sharshevsky
FDB add/del can be added via switchdev notification chain. Thus the support
for configuration via switchdev objects can be removed.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
---
 net/dsa/slave.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index f595133..6bd2d42 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -250,12 +250,6 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
 */
 
switch (obj->id) {
-   case SWITCHDEV_OBJ_ID_PORT_FDB:
-   if (switchdev_trans_ph_prepare(trans))
-   return 0;
-   err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj)->addr,
-  SWITCHDEV_OBJ_PORT_FDB(obj)->vid);
-   break;
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
break;
@@ -279,10 +273,6 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
int err;
 
switch (obj->id) {
-   case SWITCHDEV_OBJ_ID_PORT_FDB:
-   err = dsa_port_fdb_del(dp, SWITCHDEV_OBJ_PORT_FDB(obj)->addr,
-  SWITCHDEV_OBJ_PORT_FDB(obj)->vid);
-   break;
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
break;
@@ -925,8 +915,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_change_rx_flags= dsa_slave_change_rx_flags,
.ndo_set_rx_mode= dsa_slave_set_rx_mode,
.ndo_set_mac_address= dsa_slave_set_mac_address,
-   .ndo_fdb_add= switchdev_port_fdb_add,
-   .ndo_fdb_del= switchdev_port_fdb_del,
.ndo_fdb_dump   = switchdev_port_fdb_dump,
.ndo_do_ioctl   = dsa_slave_ioctl,
.ndo_get_iflink = dsa_slave_get_iflink,
-- 
2.4.11



[PATCH net-next v2 06/13] net: dsa: Add support for querying supported bridge flags

2017-07-19 Thread Arkadi Sharshevsky
The DSA drivers do not support bridge flags offload. Yet, this attribute
should be added in order for the bridge to fail when one tries to set a
flag on the port, as explained in commit dc0ecabd6231 ("net: switchdev:
Add support for querying supported bridge flags by hardware").

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
---
 net/dsa/slave.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 6bd2d42..3ad1f4d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -324,6 +324,9 @@ static int dsa_slave_port_attr_get(struct net_device *dev,
attr->u.ppid.id_len = sizeof(ds->index);
memcpy(&attr->u.ppid.id, &ds->index, attr->u.ppid.id_len);
break;
+   case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT:
+   attr->u.brport_flags_support = 0;
+   break;
default:
return -EOPNOTSUPP;
}
-- 
2.4.11



[PATCH net-next v2 13/13] net: switchdev: Remove bridge bypass support from switchdev

2017-07-19 Thread Arkadi Sharshevsky
Currently the bridge port flags, vlans, FDBs and MDBs can be offloaded
through the bridge code, making switchdev's SELF bridge bypass
implementation redundant. This implies several changes:
- No need for dump infra in switchdev, DSA's special case is handled
  privately.
- Remove obj_dump from switchdev_ops.
- FDBs are removed from obj_add/del routines, due to the fact that they
  are offloaded through the bridge notification chain.
- The switchdev_port_bridge_xx() and switchdev_port_fdb_xx() functions
  can be removed.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
---
v1->v2
- Fix typo in commit message.
---
 include/net/switchdev.h   |  75 
 net/switchdev/switchdev.c | 435 --
 2 files changed, 510 deletions(-)

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index d2637a6..d767b79 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -74,7 +74,6 @@ struct switchdev_attr {
 enum switchdev_obj_id {
SWITCHDEV_OBJ_ID_UNDEFINED,
SWITCHDEV_OBJ_ID_PORT_VLAN,
-   SWITCHDEV_OBJ_ID_PORT_FDB,
SWITCHDEV_OBJ_ID_PORT_MDB,
 };
 
@@ -97,17 +96,6 @@ struct switchdev_obj_port_vlan {
 #define SWITCHDEV_OBJ_PORT_VLAN(obj) \
container_of(obj, struct switchdev_obj_port_vlan, obj)
 
-/* SWITCHDEV_OBJ_ID_PORT_FDB */
-struct switchdev_obj_port_fdb {
-   struct switchdev_obj obj;
-   unsigned char addr[ETH_ALEN];
-   u16 vid;
-   u16 ndm_state;
-};
-
-#define SWITCHDEV_OBJ_PORT_FDB(obj) \
-   container_of(obj, struct switchdev_obj_port_fdb, obj)
-
 /* SWITCHDEV_OBJ_ID_PORT_MDB */
 struct switchdev_obj_port_mdb {
struct switchdev_obj obj;
@@ -135,8 +123,6 @@ typedef int switchdev_obj_dump_cb_t(struct switchdev_obj 
*obj);
  * @switchdev_port_obj_add: Add an object to port (see switchdev_obj_*).
  *
  * @switchdev_port_obj_del: Delete an object from port (see switchdev_obj_*).
- *
- * @switchdev_port_obj_dump: Dump port objects (see switchdev_obj_*).
  */
 struct switchdev_ops {
int (*switchdev_port_attr_get)(struct net_device *dev,
@@ -149,9 +135,6 @@ struct switchdev_ops {
  struct switchdev_trans *trans);
int (*switchdev_port_obj_del)(struct net_device *dev,
  const struct switchdev_obj *obj);
-   int (*switchdev_port_obj_dump)(struct net_device *dev,
-  struct switchdev_obj *obj,
-  switchdev_obj_dump_cb_t *cb);
 };
 
 enum switchdev_notifier_type {
@@ -189,25 +172,10 @@ int switchdev_port_obj_add(struct net_device *dev,
   const struct switchdev_obj *obj);
 int switchdev_port_obj_del(struct net_device *dev,
   const struct switchdev_obj *obj);
-int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj,
-   switchdev_obj_dump_cb_t *cb);
 int register_switchdev_notifier(struct notifier_block *nb);
 int unregister_switchdev_notifier(struct notifier_block *nb);
 int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
 struct switchdev_notifier_info *info);
-int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
- struct net_device *dev, u32 filter_mask,
- int nlflags);
-int switchdev_port_bridge_setlink(struct net_device *dev,
- struct nlmsghdr *nlh, u16 flags);
-int switchdev_port_bridge_dellink(struct net_device *dev,
- struct nlmsghdr *nlh, u16 flags);
-int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
-  struct net_device *dev, const unsigned char *addr,
-  u16 vid, u16 nlm_flags);
-int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
-  struct net_device *dev, const unsigned char *addr,
-  u16 vid);
 void switchdev_port_fwd_mark_set(struct net_device *dev,
 struct net_device *group_dev,
 bool joining);
@@ -246,13 +214,6 @@ static inline int switchdev_port_obj_del(struct net_device 
*dev,
return -EOPNOTSUPP;
 }
 
-static inline int switchdev_port_obj_dump(struct net_device *dev,
- const struct switchdev_obj *obj,
- switchdev_obj_dump_cb_t *cb)
-{
-   return -EOPNOTSUPP;
-}
-
 static inline int register_switchdev_notifier(struct notifier_block *nb)
 {
return 0;
@@ -270,42 +231,6 @@ static inline int call_switchdev_notifiers(unsigned long 
val,
return NOTIFY_DONE;
 }
 
-static inline int switchdev_port_bridge_getlink(stru

[PATCH net-next v2 11/13] net: dsa: Move FDB dump implementation inside DSA

2017-07-19 Thread Arkadi Sharshevsky
Of all switchdev devices, only DSA requires a special FDB dump. This is due
to the lack of ability to sync the hardware-learned FDBs with the bridge.
Because of this, the FDB dump is removed from switchdev and moved inside DSA.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
v1->v2
- Cosmetics. ndm_state->is_static.
---
 drivers/net/dsa/b53/b53_common.c   |  16 ++
 drivers/net/dsa/b53/b53_priv.h |   3 +-
 drivers/net/dsa/microchip/ksz_common.c |  20 ++-
 drivers/net/dsa/mt7530.c   |  10 +---
 drivers/net/dsa/mv88e6xxx/chip.c   |  38 -
 drivers/net/dsa/qca8k.c|  15 ++---
 include/net/dsa.h  |   5 +-
 include/net/switchdev.h|  12 
 net/dsa/dsa_priv.h |   2 -
 net/dsa/port.c |  11 
 net/dsa/slave.c| 100 +
 net/switchdev/switchdev.c  |  84 ---
 12 files changed, 112 insertions(+), 204 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 6020e88..28e06b6 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1227,8 +1227,7 @@ static void b53_arl_search_rd(struct b53_device *dev, u8 
idx,
 }
 
 static int b53_fdb_copy(int port, const struct b53_arl_entry *ent,
-   struct switchdev_obj_port_fdb *fdb,
-   switchdev_obj_dump_cb_t *cb)
+   dsa_fdb_dump_cb_t *cb, void *data)
 {
if (!ent->is_valid)
return 0;
@@ -1236,16 +1235,11 @@ static int b53_fdb_copy(int port, const struct 
b53_arl_entry *ent,
if (port != ent->port)
return 0;
 
-   ether_addr_copy(fdb->addr, ent->mac);
-   fdb->vid = ent->vid;
-   fdb->ndm_state = ent->is_static ? NUD_NOARP : NUD_REACHABLE;
-
-   return cb(&fdb->obj);
+   return cb(ent->mac, ent->vid, ent->is_static, data);
 }
 
 int b53_fdb_dump(struct dsa_switch *ds, int port,
-struct switchdev_obj_port_fdb *fdb,
-switchdev_obj_dump_cb_t *cb)
+dsa_fdb_dump_cb_t *cb, void *data)
 {
struct b53_device *priv = ds->priv;
struct b53_arl_entry results[2];
@@ -1263,13 +1257,13 @@ int b53_fdb_dump(struct dsa_switch *ds, int port,
return ret;
 
b53_arl_search_rd(priv, 0, &results[0]);
-   ret = b53_fdb_copy(port, &results[0], fdb, cb);
+   ret = b53_fdb_copy(port, &results[0], cb, data);
if (ret)
return ret;
 
if (priv->num_arl_entries > 2) {
b53_arl_search_rd(priv, 1, &results[1]);
-   ret = b53_fdb_copy(port, &results[1], fdb, cb);
+   ret = b53_fdb_copy(port, &results[1], cb, data);
if (ret)
return ret;
 
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index af5d6c1..01bd8cb 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -398,8 +398,7 @@ int b53_fdb_add(struct dsa_switch *ds, int port,
 int b53_fdb_del(struct dsa_switch *ds, int port,
const unsigned char *addr, u16 vid);
 int b53_fdb_dump(struct dsa_switch *ds, int port,
-struct switchdev_obj_port_fdb *fdb,
-switchdev_obj_dump_cb_t *cb);
+dsa_fdb_dump_cb_t *cb, void *data);
 int b53_mirror_add(struct dsa_switch *ds, int port,
   struct dsa_mall_mirror_tc_entry *mirror, bool ingress);
 void b53_mirror_del(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index 4de9d90..56cd6d3 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -805,12 +805,11 @@ static void convert_alu(struct alu_struct *alu, u32 
*alu_table)
 }
 
 static int ksz_port_fdb_dump(struct dsa_switch *ds, int port,
-struct switchdev_obj_port_fdb *fdb,
-switchdev_obj_dump_cb_t *cb)
+dsa_fdb_dump_cb_t *cb, void *data)
 {
struct ksz_device *dev = ds->priv;
int ret = 0;
-   u32 data;
+   u32 ksz_data;
u32 alu_table[4];
struct alu_struct alu;
int timeout;
@@ -823,8 +822,8 @@ static int ksz_port_fdb_dump(struct dsa_switch *ds, int 
port,
do {
timeout = 1000;
do {
-   ksz_read32(dev, REG_SW_ALU_CTRL__4, &data);
-   if ((data & ALU_VALID) || !(data & ALU_START))
+   ksz_read32(dev, REG_SW_ALU_CTRL__4, &ksz_data);
+   if ((ksz_data & ALU_VALID) || !(ksz_data & ALU_START))
break;

[PATCH net-next v2 12/13] net: bridge: Remove FDB deletion through switchdev object

2017-07-19 Thread Arkadi Sharshevsky
At this point no driver supports FDB add/del through a switchdev object;
all of them use the notification chain instead. Thus, it is removed.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
---
 net/bridge/br_fdb.c | 18 --
 1 file changed, 18 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a5e4a73..a79b648 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -169,29 +169,11 @@ static void fdb_del_hw_addr(struct net_bridge *br, const 
unsigned char *addr)
}
 }
 
-static void fdb_del_external_learn(struct net_bridge_fdb_entry *f)
-{
-   struct switchdev_obj_port_fdb fdb = {
-   .obj = {
-   .orig_dev = f->dst->dev,
-   .id = SWITCHDEV_OBJ_ID_PORT_FDB,
-   .flags = SWITCHDEV_F_DEFER,
-   },
-   .vid = f->vlan_id,
-   };
-
-   ether_addr_copy(fdb.addr, f->addr.addr);
-   switchdev_port_obj_del(f->dst->dev, &fdb.obj);
-}
-
 static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
 {
if (f->is_static)
fdb_del_hw_addr(br, f->addr.addr);
 
-   if (f->added_by_external_learn)
-   fdb_del_external_learn(f);
-
hlist_del_init_rcu(&f->hlist);
fdb_notify(br, f, RTM_DELNEIGH);
call_rcu(&f->rcu, fdb_rcu_free);
-- 
2.4.11



[PATCH net-next v2 09/13] net: dsa: Remove support for MDB dump from DSA's drivers

2017-07-19 Thread Arkadi Sharshevsky
This is done as a preparation before removing support for MDB dump from
DSA core. The MDBs are synced with the bridge and thus there is no
need for special dump operation support.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 drivers/net/dsa/microchip/ksz_common.c |  9 -
 drivers/net/dsa/mv88e6xxx/chip.c   | 24 
 2 files changed, 33 deletions(-)

diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index a53ce59..4de9d90 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1020,14 +1020,6 @@ static int ksz_port_mdb_del(struct dsa_switch *ds, int 
port,
return ret;
 }
 
-static int ksz_port_mdb_dump(struct dsa_switch *ds, int port,
-struct switchdev_obj_port_mdb *mdb,
-switchdev_obj_dump_cb_t *cb)
-{
-   /* this is not called by switch layer */
-   return 0;
-}
-
 static int ksz_port_mirror_add(struct dsa_switch *ds, int port,
   struct dsa_mall_mirror_tc_entry *mirror,
   bool ingress)
@@ -1090,7 +1082,6 @@ static const struct dsa_switch_ops ksz_switch_ops = {
.port_mdb_prepare   = ksz_port_mdb_prepare,
.port_mdb_add   = ksz_port_mdb_add,
.port_mdb_del   = ksz_port_mdb_del,
-   .port_mdb_dump  = ksz_port_mdb_dump,
.port_mirror_add= ksz_port_mirror_add,
.port_mirror_del= ksz_port_mirror_del,
 };
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 9cc6269..97b77b9 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1443,15 +1443,6 @@ static int mv88e6xxx_port_db_dump_fid(struct 
mv88e6xxx_chip *chip,
fdb->ndm_state = NUD_NOARP;
else
fdb->ndm_state = NUD_REACHABLE;
-   } else if (obj->id == SWITCHDEV_OBJ_ID_PORT_MDB) {
-   struct switchdev_obj_port_mdb *mdb;
-
-   if (!is_multicast_ether_addr(addr.mac))
-   continue;
-
-   mdb = SWITCHDEV_OBJ_PORT_MDB(obj);
-   mdb->vid = vid;
-   ether_addr_copy(mdb->addr, addr.mac);
} else {
return -EOPNOTSUPP;
}
@@ -3762,20 +3753,6 @@ static int mv88e6xxx_port_mdb_del(struct dsa_switch *ds, 
int port,
return err;
 }
 
-static int mv88e6xxx_port_mdb_dump(struct dsa_switch *ds, int port,
-  struct switchdev_obj_port_mdb *mdb,
-  switchdev_obj_dump_cb_t *cb)
-{
-   struct mv88e6xxx_chip *chip = ds->priv;
-   int err;
-
-   mutex_lock(&chip->reg_lock);
-   err = mv88e6xxx_port_db_dump(chip, port, &mdb->obj, cb);
-   mutex_unlock(&chip->reg_lock);
-
-   return err;
-}
-
 static const struct dsa_switch_ops mv88e6xxx_switch_ops = {
.probe  = mv88e6xxx_drv_probe,
.get_tag_protocol   = mv88e6xxx_get_tag_protocol,
@@ -3809,7 +3786,6 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = 
{
.port_mdb_prepare   = mv88e6xxx_port_mdb_prepare,
.port_mdb_add   = mv88e6xxx_port_mdb_add,
.port_mdb_del   = mv88e6xxx_port_mdb_del,
-   .port_mdb_dump  = mv88e6xxx_port_mdb_dump,
.crosschip_bridge_join  = mv88e6xxx_crosschip_bridge_join,
.crosschip_bridge_leave = mv88e6xxx_crosschip_bridge_leave,
 };
-- 
2.4.11



[PATCH net-next v2 10/13] net: dsa: Remove redundant MDB dump support

2017-07-19 Thread Arkadi Sharshevsky
Currently the MDB HW database is synced with the bridge's one; thus,
there is no need to support special dump functionality.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
---
 include/net/dsa.h  |  4 
 net/dsa/dsa_priv.h |  2 --
 net/dsa/port.c | 11 ---
 net/dsa/slave.c|  3 ---
 4 files changed, 20 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 4b82647..0f4912b 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -406,10 +406,6 @@ struct dsa_switch_ops {
struct switchdev_trans *trans);
int (*port_mdb_del)(struct dsa_switch *ds, int port,
const struct switchdev_obj_port_mdb *mdb);
-   int (*port_mdb_dump)(struct dsa_switch *ds, int port,
-struct switchdev_obj_port_mdb *mdb,
- switchdev_obj_dump_cb_t *cb);
-
/*
 * RXNFC
 */
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index cddcea2..897ac24 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -132,8 +132,6 @@ int dsa_port_mdb_add(struct dsa_port *dp,
 struct switchdev_trans *trans);
 int dsa_port_mdb_del(struct dsa_port *dp,
 const struct switchdev_obj_port_mdb *mdb);
-int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb,
- switchdev_obj_dump_cb_t *cb);
 int dsa_port_vlan_add(struct dsa_port *dp,
  const struct switchdev_obj_port_vlan *vlan,
  struct switchdev_trans *trans);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index ce19216..7378782 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -210,17 +210,6 @@ int dsa_port_mdb_del(struct dsa_port *dp,
return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, );
 }
 
-int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb,
- switchdev_obj_dump_cb_t *cb)
-{
-   struct dsa_switch *ds = dp->ds;
-
-   if (ds->ops->port_mdb_dump)
-   return ds->ops->port_mdb_dump(ds, dp->index, mdb, cb);
-
-   return -EOPNOTSUPP;
-}
-
 int dsa_port_vlan_add(struct dsa_port *dp,
  const struct switchdev_obj_port_vlan *vlan,
  struct switchdev_trans *trans)
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index f939d79..14f4d69 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -299,9 +299,6 @@ static int dsa_slave_port_obj_dump(struct net_device *dev,
case SWITCHDEV_OBJ_ID_PORT_FDB:
err = dsa_port_fdb_dump(dp, SWITCHDEV_OBJ_PORT_FDB(obj), cb);
break;
-   case SWITCHDEV_OBJ_ID_PORT_MDB:
-   err = dsa_port_mdb_dump(dp, SWITCHDEV_OBJ_PORT_MDB(obj), cb);
-   break;
default:
err = -EOPNOTSUPP;
break;
-- 
2.4.11



[PATCH net-next v2 03/13] net: dsa: Remove switchdev dependency from DSA switch notifier chain

2017-07-19 Thread Arkadi Sharshevsky
Currently, the switchdev objects are embedded inside the DSA notifier
info. This patch removes this dependency. This is done as a preparation
stage before adding support for learning FDB through the switchdev
notification chain.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Florian Fainelli <f.faine...@gmail.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
---
 net/dsa/dsa_priv.h | 11 ++-
 net/dsa/port.c | 15 +--
 net/dsa/slave.c|  6 --
 net/dsa/switch.c   | 11 ---
 4 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 428402f..2b2f124 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -43,9 +43,10 @@ struct dsa_notifier_bridge_info {
 
 /* DSA_NOTIFIER_FDB_* */
 struct dsa_notifier_fdb_info {
-   const struct switchdev_obj_port_fdb *fdb;
int sw_index;
int port;
+   const unsigned char *addr;
+   u16 vid;
 };
 
 /* DSA_NOTIFIER_MDB_* */
@@ -119,10 +120,10 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
struct switchdev_trans *trans);
 int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock,
 struct switchdev_trans *trans);
-int dsa_port_fdb_add(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb);
-int dsa_port_fdb_del(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb);
+int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+u16 vid);
+int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+u16 vid);
 int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb,
  switchdev_obj_dump_cb_t *cb);
 int dsa_port_mdb_add(struct dsa_port *dp,
diff --git a/net/dsa/port.c b/net/dsa/port.c
index bd271b9..86e0585 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -146,25 +146,28 @@ int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock,
return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info);
 }
 
-int dsa_port_fdb_add(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb)
+int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+u16 vid)
 {
struct dsa_notifier_fdb_info info = {
.sw_index = dp->ds->index,
.port = dp->index,
-   .fdb = fdb,
+   .addr = addr,
+   .vid = vid,
};
 
return dsa_port_notify(dp, DSA_NOTIFIER_FDB_ADD, &info);
 }
 
-int dsa_port_fdb_del(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb)
+int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+u16 vid)
 {
struct dsa_notifier_fdb_info info = {
.sw_index = dp->ds->index,
.port = dp->index,
-   .fdb = fdb,
+   .addr = addr,
+   .vid = vid,
+
};
 
return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index b4e68b2..19395cc 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -253,7 +253,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
case SWITCHDEV_OBJ_ID_PORT_FDB:
if (switchdev_trans_ph_prepare(trans))
return 0;
-   err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj));
+   err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj)->addr,
+  SWITCHDEV_OBJ_PORT_FDB(obj)->vid);
break;
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
@@ -279,7 +280,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
 
switch (obj->id) {
case SWITCHDEV_OBJ_ID_PORT_FDB:
-   err = dsa_port_fdb_del(dp, SWITCHDEV_OBJ_PORT_FDB(obj));
+   err = dsa_port_fdb_del(dp, SWITCHDEV_OBJ_PORT_FDB(obj)->addr,
+  SWITCHDEV_OBJ_PORT_FDB(obj)->vid);
break;
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index eb20e0f..e6c06aa 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -83,8 +83,6 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
 static int dsa_switch_fdb_add(struct dsa_switch *ds,
  struct dsa_notifier_fdb_info *info)
 {
-   const struct switchdev_obj_port_fdb *fdb = info->fdb;
-
/* Do not care yet about other switch chips of the fabric */
if (ds->index != info->sw_index)
return 0;
@@ -92,14 +90,13 @@ static int ds

Re: [PATCH net-next 09/11] net: dsa: Move FDB dump implementation inside DSA

2017-07-19 Thread Arkadi Sharshevsky


On 07/18/2017 09:06 PM, Vivien Didelot wrote:
> Hi Arkadi,
> 
> Arkadi Sharshevsky <arka...@mellanox.com> writes:
> 
>> +typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid,
>> +  u16 ndm_state, void *data);
> 
> Can I ask you to change u16 ndm_state for bool is_static at the same
> time? Ethernet switches do not need to report more than that.
>

Will fix, thanks.

>> +static int
>> +dsa_slave_port_fdb_do_dump(const unsigned char *addr, u16 vid,
>> +   u16 ndm_state, void *data)
>> +{
>> +struct dsa_slave_dump_ctx *dump = data;
>> +u32 portid = NETLINK_CB(dump->cb->skb).portid;
>> +u32 seq = dump->cb->nlh->nlmsg_seq;
>> +struct nlmsghdr *nlh;
>> +struct ndmsg *ndm;
>> +
>> +if (dump->idx < dump->cb->args[2])
>> +goto skip;
>> +
>> +nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
>> +sizeof(*ndm), NLM_F_MULTI);
>> +if (!nlh)
>> +return -EMSGSIZE;
>> +
>> +ndm = nlmsg_data(nlh);
>> +ndm->ndm_family  = AF_BRIDGE;
>> +ndm->ndm_pad1= 0;
>> +ndm->ndm_pad2= 0;
>> +ndm->ndm_flags   = NTF_SELF;
>> +ndm->ndm_type= 0;
>> +ndm->ndm_ifindex = dump->dev->ifindex;
>> +ndm->ndm_state   = ndm_state;
> 
> So we can simply scope this here:
> 
> ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE;
> 
>> +
>> +if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr))
>> +goto nla_put_failure;
>> +
>> +if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid))
>> +goto nla_put_failure;
>> +
>> +nlmsg_end(dump->skb, nlh);
>> +
>> +skip:
>> +dump->idx++;
>> +return 0;
>> +
>> +nla_put_failure:
>> +nlmsg_cancel(dump->skb, nlh);
>> +return -EMSGSIZE;
>> +}
> 
> Other than that, LGTM.
> 
> 
> Thanks,
> 
> Vivien
> 


Re: [PATCH net-next 07/11] net: dsa: Remove support for bypass bridge port attributes/vlan set

2017-07-19 Thread Arkadi Sharshevsky


On 07/18/2017 08:40 PM, Vivien Didelot wrote:
> Hi Arkadi,
> 
> Arkadi Sharshevsky <arka...@mellanox.com> writes:
> 
>> The bridge port attributes/vlan for DSA devices should be set only
>> from bridge code. Furthermore, the vlans are synced totally with the
>> bridge so there is no need for special dump support.
>>
>> Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
>> ---
>>  drivers/net/dsa/b53/b53_common.c   | 44 --
>>  drivers/net/dsa/b53/b53_priv.h |  3 --
>>  drivers/net/dsa/bcm_sf2.c  |  1 -
>>  drivers/net/dsa/dsa_loop.c | 38 ---
>>  drivers/net/dsa/microchip/ksz_common.c | 41 -
>>  drivers/net/dsa/mv88e6xxx/chip.c   | 56 
>> --
>>  include/net/dsa.h  |  4 ---
>>  net/dsa/dsa_priv.h |  4 ---
>>  net/dsa/port.c | 12 
>>  net/dsa/slave.c|  6 
> 
> Regarding this massive deletion, can you please split it in two patches,
> one deleting first the DSA core usage of .port_vlan_dump, i.e. in:
> 
> net/dsa/dsa_priv.h
> net/dsa/port.c
> net/dsa/slave.c
> 
> Then a second patch which deletes the .port_vlan_dump implementations?
> 
> This may sound useless but it will actually make it easy for us to
> restore the VLAN dump support in drivers once we introduce an
> alternative way to query the hardware.
> 
> 
> Thanks,
> 
> Vivien
> 

Yeah, no problem. But doesn't it make more sense to remove it first from
the drivers and then from the core? If it is removed from the core first,
it will leave unused code behind in the drivers. Furthermore, this order
mirrors how code is added: you first add the core implementation and
only then the drivers.




Re: [PATCH net-next 04/11] net: dsa: Add support for learning FDB through notification

2017-07-19 Thread Arkadi Sharshevsky


On 07/18/2017 08:16 PM, Vivien Didelot wrote:
> Hi Arkadi,
> 
> Arkadi Sharshevsky <arka...@mellanox.com> writes:
> 
>> --- a/include/net/dsa.h
>> +++ b/include/net/dsa.h
>> @@ -451,6 +451,7 @@ void unregister_switch_driver(struct dsa_switch_driver 
>> *type);
>>  struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev);
>>  
>>  struct net_device *dsa_dev_to_net_device(struct device *dev);
>> +bool dsa_schedule_work(struct work_struct *work);
> 
> You forgot to move this declaration to net/dsa/dsa_priv.h, since this is
> private to DSA core and does not need to be exposed to drivers ;-)
>

No problem, will move it to dsa_priv.h, thanks.

>> +err = unregister_netdevice_notifier(&dsa_slave_switchdev_notifier);
>> +if (err)
>> +pr_err("DSA: failed to unregister switchdev notifier (%d)\n", 
>> err);
> 
> I think you meant unregister_switchdev_notifier() here.
> 
> Thanks,
> 
> Vivien
> 


[PATCH net-next 03/11] net: dsa: Remove switchdev dependency from DSA switch notifier chain

2017-07-18 Thread Arkadi Sharshevsky
Currently, the switchdev objects are embedded inside the DSA notifier
info. This patch removes this dependency. This is done as a preparation
stage before adding support for learning FDB through the switchdev
notification chain.

Signed-off-by: Arkadi Sharshevsky <arka...@mellanox.com>
Reviewed-by: Florian Fainelli <f.faine...@gmail.com>
Reviewed-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
---
 net/dsa/dsa_priv.h | 11 ++-
 net/dsa/port.c | 15 +--
 net/dsa/slave.c|  6 --
 net/dsa/switch.c   | 11 ---
 4 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 428402f..2b2f124 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -43,9 +43,10 @@ struct dsa_notifier_bridge_info {
 
 /* DSA_NOTIFIER_FDB_* */
 struct dsa_notifier_fdb_info {
-   const struct switchdev_obj_port_fdb *fdb;
int sw_index;
int port;
+   const unsigned char *addr;
+   u16 vid;
 };
 
 /* DSA_NOTIFIER_MDB_* */
@@ -119,10 +120,10 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
struct switchdev_trans *trans);
 int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock,
 struct switchdev_trans *trans);
-int dsa_port_fdb_add(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb);
-int dsa_port_fdb_del(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb);
+int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+u16 vid);
+int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+u16 vid);
 int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb,
  switchdev_obj_dump_cb_t *cb);
 int dsa_port_mdb_add(struct dsa_port *dp,
diff --git a/net/dsa/port.c b/net/dsa/port.c
index bd271b9..86e0585 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -146,25 +146,28 @@ int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock,
return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info);
 }
 
-int dsa_port_fdb_add(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb)
+int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+u16 vid)
 {
struct dsa_notifier_fdb_info info = {
.sw_index = dp->ds->index,
.port = dp->index,
-   .fdb = fdb,
+   .addr = addr,
+   .vid = vid,
};
 
return dsa_port_notify(dp, DSA_NOTIFIER_FDB_ADD, &info);
 }
 
-int dsa_port_fdb_del(struct dsa_port *dp,
-const struct switchdev_obj_port_fdb *fdb)
+int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+u16 vid)
 {
struct dsa_notifier_fdb_info info = {
.sw_index = dp->ds->index,
.port = dp->index,
-   .fdb = fdb,
+   .addr = addr,
+   .vid = vid,
+
};
 
return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index b4e68b2..19395cc 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -253,7 +253,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
case SWITCHDEV_OBJ_ID_PORT_FDB:
if (switchdev_trans_ph_prepare(trans))
return 0;
-   err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj));
+   err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj)->addr,
+  SWITCHDEV_OBJ_PORT_FDB(obj)->vid);
break;
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
@@ -279,7 +280,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
 
switch (obj->id) {
case SWITCHDEV_OBJ_ID_PORT_FDB:
-   err = dsa_port_fdb_del(dp, SWITCHDEV_OBJ_PORT_FDB(obj));
+   err = dsa_port_fdb_del(dp, SWITCHDEV_OBJ_PORT_FDB(obj)->addr,
+  SWITCHDEV_OBJ_PORT_FDB(obj)->vid);
break;
case SWITCHDEV_OBJ_ID_PORT_MDB:
err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index eb20e0f..e6c06aa 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -83,8 +83,6 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
 static int dsa_switch_fdb_add(struct dsa_switch *ds,
  struct dsa_notifier_fdb_info *info)
 {
-   const struct switchdev_obj_port_fdb *fdb = info->fdb;
-
/* Do not care yet about other switch chips of the fabric */
if (ds->index != info->sw_index)
return 0;
@@ -92,14 +90,13 @@ static int ds

  1   2   >