from:"Yuval Mintz"

RE: [PATCH net-next 1/1] bnx2x: Collect the device debug information during Tx timeout.

2018-05-24 Thread Yuval Mintz

> Tx-timeout mostly happens due to some issue in the device. In such cases,
> debug dump would be helpful for identifying the cause of the issue.
> This patch adds support to spill debug data during the Tx timeout. Here
> bnx2x_panic_dump() API is used instead of bnx2x_panic(), since we still
> want to allow the Tx-timeout recovery a chance to succeed.
> 
> Please consider applying this to "net-next".
> 
> Signed-off-by: Sudarsana Reddy Kalluru 
> ---
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> index 95871576..182d5e1 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> @@ -4962,8 +4962,13 @@ void bnx2x_tx_timeout(struct net_device *dev)
>  {
>   struct bnx2x *bp = netdev_priv(dev);
> 
> -#ifdef BNX2X_STOP_ON_ERROR
> + /* We want the information of the dump logged,
> +  * but calling bnx2x_panic() would kill all chances of recovery.
> +  */
>   if (!bp->panic)
> +#ifdef BNX2X_STOP_ON_ERROR
> + bnx2x_panic_dump(bp, false);
> +#else
>   bnx2x_panic();
>  #endif

This looks backward to me; when BNX2X_STOP_ON_ERROR is defined
you *want* bnx2x_panic() to fatally stop the device, not the other way
around.
I.e., s/ifdef/ifndef/

> 
> --
> 1.8.3.1

[PATCH iproute2-next] tc: Correct json output for actions

2018-04-04 Thread Yuval Mintz

Commit 9fd3f0b255d9 ("tc: enable json output for actions") added JSON
support for tc-actions at the expense of breaking other use cases that
reach tc_print_action(), as the latter don't expect the 'actions' array
to be a new object.

Consider the following, taken during a run of the tc_chain.sh selftest,
and see that the latter command's output is broken:

$ ./tc/tc -j -p actions list action gact | grep -C 3 actions
[ {
"total acts": 1
},{
"actions": [ {
"order": 0,

$ ./tc/tc -p -j -s filter show dev enp3s0np2 ingress | grep -C 3 actions
},
"skip_hw": true,
"not_in_hw": true,{
"actions": [ {
"order": 1,
"kind": "gact",
"control_action": {

Relocate the open/close of the JSON object to declare the object only
for the case that needs it.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 tc/m_action.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index 2f85d35..8993b93 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -366,7 +366,6 @@ tc_print_action(FILE *f, const struct rtattr *arg, unsigned 
short tot_acts)
if (tab_flush && NULL != tb[0]  && NULL == tb[1])
return tc_print_action_flush(f, tb[0]);
 
-   open_json_object(NULL);
open_json_array(PRINT_JSON, "actions");
for (i = 0; i <= tot_acts; i++) {
if (tb[i]) {
@@ -383,7 +382,6 @@ tc_print_action(FILE *f, const struct rtattr *arg, unsigned 
short tot_acts)
 
}
close_json_array(PRINT_JSON, NULL);
-   close_json_object();
 
return 0;
 }
@@ -439,8 +437,9 @@ int print_action(const struct sockaddr_nl *who,
}
}
 
-
+   open_json_object(NULL);
tc_print_action(fp, tb[TCA_ACT_TAB], tot_acts ? *tot_acts:0);
+   close_json_object();
 
return 0;
 }
-- 
2.4.3

Re: [PATCH net-next 3/4] qed: Adapter flash update support.

2018-03-27 Thread Yuval Mintz

On Mon, Mar 26, 2018 at 03:13:47AM -0700, Sudarsana Reddy Kalluru wrote:
> This patch adds the required driver support for updating the flash or
> non volatile memory of the adapter. At highlevel, flash upgrade comprises
> of reading the flash images from the input file, validating the images and
> writing it to the respective paritions.

s/it/them/

[...]

> + * 
> /--\
> + * 0B  |   0x4 [command index]   
>  |
> + * 4B  | image_type | Options|  Number of register settings  
>  |
> + * 8B  |   Value 
>  |
> + * 12B |   Mask  
>  |
> + * 16B |   Offset
>  |
> + * 
> \--/
> + * There can be several Value-Mask-Offset sets as specified by 'Number 
> of...'.
> + * Options - 0'b - Calculate & Update CRC for image
> + */
> +static int qed_nvm_flash_image_access(struct qed_dev *cdev, const u8 **data,
> +   bool *check_resp)
> +{
> + struct qed_nvm_image_att nvm_image;
> + struct qed_hwfn *p_hwfn;
> + bool is_crc = false;
> + u32 image_type;
> + int rc = 0, i;
> + u16 len;
> +
 +
> + nvm_image.start_addr = p_hwfn->nvm_info.image_att[i].nvm_start_addr;
> + nvm_image.length = p_hwfn->nvm_info.image_att[i].len;
> +
> + DP_VERBOSE(cdev, NETIF_MSG_DRV,
> +"Read image %02x; type = %08x; NVM [%08x,...,%08x]\n",
> +**data, nvm_image.start_addr, image_type,
> +nvm_image.start_addr + nvm_image.length - 1);

Looks like 3rd and 4th printed parameters are flipped.


> + (*data)++;
> + is_crc = !!(**data);

If you actually want to be able to use the reserved bits
[forward-compatibility] in the future, you should check bit 0 instead of
checking the whole byte.

> + (*data)++;
> + len = *((u16 *)*data);
> + *data += 2;

[...]

> +
> +/* Binary file format -
> + * 
> /--\
> + * 0B  |   0x3 [command index]   
>  |
> + * 4B  | b'0: check_response?   | b'1-127  reserved  
>  |
This implies there are 128 bits in a 4-byte field (which holds only 32).

> + * 8B  | File-type |   reserved  
>  |
> + * 
> \--/
> + * Start a new file of the provided type
> + */
> +static int qed_nvm_flash_image_file_start(struct qed_dev *cdev,
> +   const u8 **data, bool *check_resp)
> +{
> + int rc;
> +
> + *data += 4;
> + *check_resp = !!(**data);

Like above

> + *data += 4;
> +
> + DP_VERBOSE(cdev, NETIF_MSG_DRV,
> +"About to start a new file of type %02x\n", **data);
> + rc = qed_mcp_nvm_put_file_begin(cdev, **data);
> + *data += 4;
> +
> + return rc;
> +}
> +
> +/* Binary file format -
> + * 
> /--\
> + * 0B  |   0x2 [command index]   
>  |
> + * 4B  |   Length in bytes   
>  |
> + * 8B  | b'0: check_response?   | b'1-127  reserved  
>  |

Same as above

> + * 12B |   Offset in bytes   
>  |
> + * 16B |   Data ...  
>  |
> + * 
> \--/
> + * Write data as part of a file that was previously started. Data should 
> be
> + * of length equal to that provided in the message
> + */
> +static int qed_nvm_flash_image_file_data(struct qed_dev *cdev,
> +  const u8 **data, bool *check_resp)
> +{
> + u32 offset, len;
> + int rc;
> +
> + *data += 4;
> + len = *((u32 *)(*data));
> + *data += 4;
> + *check_resp = !!(**data);

Same as above

> + *data += 4;
> + offset = *((u32 *)(*data));
> + *data += 4;
> +
> + DP_VERBOSE(cdev, NETIF_MSG_DRV,
> +"About to write File-data: %08x bytes to offset %08x\n",
> +len, offset);
> +
> + rc = qed_mcp_nvm_write(cdev, QED_PUT_FILE_DATA, offset,
> +(char *)(*data), len);
> + *data += len;
> +
> + return rc;
> +}

[...]

> +
> +static int qed_nvm_flash(struct qed_dev *cdev, const char *name)
> +{
> + rc = qed_nvm_flash_image_validate(cdev, image, );
> + if (rc)
> + goto exit;
> +
> + while (data < data_end) {
> + bool check_resp = false;
> +
> + /* Parse

Re: [PATCH net-next 1/4] qed: Populate nvm image attribute shadow.

2018-03-27 Thread Yuval Mintz

On Mon, Mar 26, 2018 at 03:13:45AM -0700, Sudarsana Reddy Kalluru wrote:
> This patch add support for populating the flash image attributes.
s/add/adds/

[...]

> -int qed_mcp_bist_nvm_test_get_image_att(struct qed_hwfn *p_hwfn,
> - struct qed_ptt *p_ptt,
> - struct bist_nvm_image_att *p_image_att,
> +int qed_mcp_bist_nvm_get_image_att(struct qed_hwfn *p_hwfn,
> +struct qed_ptt *p_ptt,
> +struct bist_nvm_image_att *p_image_att,
>   u32 image_index)

Indentation seems broken.

>  
> +int qed_mcp_nvm_info_populate(struct qed_hwfn *p_hwfn)
> +{
> + struct qed_nvm_image_info *nvm_info = _hwfn->nvm_info;
> + struct qed_ptt *p_ptt;
> + int rc;
> + u32 i;
> +
> + p_ptt = qed_ptt_acquire(p_hwfn);
> + if (!p_ptt) {
> + DP_ERR(p_hwfn, "failed to acquire ptt\n");
> + return -EBUSY;
> + }
> +
> + /* Acquire from MFW the amount of available images */
> + nvm_info->num_images = 0;
> + rc = qed_mcp_bist_nvm_get_num_images(p_hwfn,
> +  p_ptt, _info->num_images);
> + if (rc == -EOPNOTSUPP) {
> + DP_INFO(p_hwfn, "DRV_MSG_CODE_BIST_TEST is not supported\n");
> + goto out;
> + } else if ((rc != 0) || (nvm_info->num_images == 0)) {

rc || !nvm_info->num_images

> + DP_ERR(p_hwfn, "Failed getting number of images\n");
> + goto err0;
> + }
> +
> + nvm_info->image_att =
> + kmalloc(nvm_info->num_images * sizeof(struct bist_nvm_image_att),
> + GFP_KERNEL);

Indentation can be better than this.

[...]

> --- a/drivers/net/ethernet/qlogic/qed/qed_selftest.c
> +++ b/drivers/net/ethernet/qlogic/qed/qed_selftest.c
> @@ -125,10 +125,11 @@ int qed_selftest_nvram(struct qed_dev *cdev)
>   }
>  
>   /* Acquire from MFW the amount of available images */
> - rc = qed_mcp_bist_nvm_test_get_num_images(p_hwfn, p_ptt, _images);
> + rc = qed_mcp_bist_nvm_get_num_images(p_hwfn, p_ptt, _images);
>   if (rc || !num_images) {
>   DP_ERR(p_hwfn, "Failed getting number of images\n");
> - return -EINVAL;
> + rc = -EINVAL;
> + goto err0;

Well, this one is a bug fix [Failure flow currently leaks a PTT entry].
If you don't want to treat it as one that's fine, but I think it
deserves its own patch in the series.

>   }
>  
>   /* Iterate over images and validate CRC */
> @@ -136,8 +137,8 @@ int qed_selftest_nvram(struct qed_dev *cdev)
>   /* This mailbox returns information about the image required for
>* reading it.
>*/
> - rc = qed_mcp_bist_nvm_test_get_image_att(p_hwfn, p_ptt,
> -  _att, i);
> + rc = qed_mcp_bist_nvm_get_image_att(p_hwfn, p_ptt,
> + _att, i);
>   if (rc) {
>   DP_ERR(p_hwfn,
>  "Failed getting image index %d attributes\n",
> -- 
> 1.8.3.1
>

RE: [PATCH RESEND net-next] net: Do synchronize_rcu() in ip6mr_sk_done() only if this is needed

2018-03-07 Thread Yuval Mintz

> >>> After unshare test kworker hangs for ages:
> >>>
> >>> $ while :; do unshare -n true; done &
> >>>
> >>> $ perf report 
> >>> -   88,82% 0,00%  kworker/u16:0  [kernel.vmlinux]  [k]
> >>> cleanup_net
> >>>  cleanup_net
> >>>    - ops_exit_list.isra.9
> >>>   - 85,68% igmp6_net_exit
> >>>  - 53,31% sock_release
> >>> - inet_release
> >>>    - 25,33% rawv6_close
> >>>   - ip6mr_sk_done
> >>>  + 23,38% synchronize_rcu
> >>>
> >>> Keep in mind, this perf report shows the time a function was
> >>> executing, and
> >>> it does not show the time, it was sleeping. But it's easy to imagine,
> >>> how
> >>> much it is sleeping, if synchronize_rcu() execution takes the most
> >>> time.
> >>> Top shows the kworker R time is less then 1%.
> >>>
> >>> This happen, because of in ip6mr_sk_done() we do too many
> >>> synchronize_rcu(),
> >>> even for the sockets, that are not referenced in mr_table, and which
> >>> are not
> >>> need it. So, the progress of kworker becomes very slow.
> >>>
> >>> The patch introduces apparent solution, and it makes ip6mr_sk_done()
> >>> to skip
> >>> synchronize_rcu() for sockets, that are not need that. After the
> >>> patch,
> >>> kworker becomes able to warm the cpu up as expected.
> >>>
> >>> Signed-off-by: Kirill Tkhai 
> >>> ---
> >>>  net/ipv6/ip6mr.c |4 +++-
> >>>  1 file changed, 3 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
> >>> index 2a38f9de45d3..290a8d0d5eac 100644
> >>> --- a/net/ipv6/ip6mr.c
> >>> +++ b/net/ipv6/ip6mr.c
> >>> @@ -1485,7 +1485,9 @@ int ip6mr_sk_done(struct sock *sk)
> >>>   }
> >>>   }
> >>>   rtnl_unlock();
> >>> - synchronize_rcu();
> >>> +
> >>> + if (!err)
> >>> + synchronize_rcu();
> >>>
> >>
> >>
> >> But... what is this synchronize_rcu() doing exactly ?
> >>
> >> This was added in 8571ab479a6e1ef46ead5ebee567e128a422767c
> >>
> >> ("ip6mr: Make mroute_sk rcu-based")
> >>
> >> Typically on a delete, the synchronize_rcu() would be needed before
> >> freeing the deleted object.
> >>
> >> But nowadays we have better way : SOCK_RCU_FREE
> >
> > Hm. I'm agree with you. This is hot path, and mroute sockets created from
> userspace
> > will delay userspace tasks close() and exit(). Since there may be many such
> sockets,
> > we may get a zombie task, which can't be reaped for ages. This slows down
> the system
> > directly.
> >
> > Fix for pernet_operations works, but we need generic solution instead.
> >
> > The commit "8571ab479a6e1ef46ead5ebee567e128a422767c" says:
> >
> > ip6mr: Make mroute_sk rcu-based
> >
> > In ipmr the mr_table socket is handled under RCU. Introduce the same
> > for ip6mr.
> >
> > There is no pointing to improvements it invents, or to the problem it 
> > solves.
> The description
> > looks like a cleanup. It's too expensive cleanup, if it worsens the
> performance a hundred
> > times.
> >
> > Can't we simply revert it?!
> >
> > Yuval, do you have ideas to fix that (maybe, via SOCK_RCU_FREE suggested
> by Eric)?

Sorry, I failed to notice that ip6mr_sk_done() is called unconditionally
from rawv6_close(). But even with your suggested fix it should be ~resolved
[as only sockets used for ip6mr would reach the sync].
Or are you claiming there's still some performance hit even with your
suggested change?

> >
> > We actually use rcu_dereference() in ip6mr_cache_report() only. The only
> user of dereference
> > is sock_queue_rcv_skb(). Superficial analysis shows we purge the queue in
> inet_sock_destruct().
> 
> + So this change should be safe.

I might have misunderstood the comment from
commit 4c9687098f24 ("ipmr: RCU conversion of mroute_sk") when writing
this; I thought the comment regarding ip_ra_destroy() meant that for the
IPv6 case we DO have to make sure there's a grace period before destroying
the socket.

> 
> > Thanks,
> > Kirill
> >

RE: [PATCH V2 net] qed: Free RoCE ILT Memory on rmmod qedr

2018-03-05 Thread Yuval Mintz

> - /* Free Task CXT */
> + /* Free Task CXT ( Intentionally RoCE as task-id is shared between
> +  * RoCE and iWARP
> +  */

Broken parenthesis in comment...

[PATCH v2 net-next 05/11] ipmr, ip6mr: Unite creation of new mr_table

2018-02-28 Thread Yuval Mintz

Now that both ipmr and ip6mr are using the same mr_table structure,
we can have a common function to allocate & initialize a new instance.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
Acked-by: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
---
 include/linux/mroute_base.h | 17 +
 net/ipv4/ipmr.c | 27 ++-
 net/ipv4/ipmr_base.c| 27 +++
 net/ipv6/ip6mr.c| 30 ++
 4 files changed, 64 insertions(+), 37 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 1cc944a..8053057 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -85,6 +85,13 @@ void vif_device_init(struct vif_device *v,
 unsigned char threshold,
 unsigned short flags,
 unsigned short get_iflink_mask);
+
+struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+  const struct rhashtable_params *rht_params,
+  void (*expire_func)(struct timer_list *t),
+  void (*table_set)(struct mr_table *mrt,
+struct net *net));
 #else
 static inline void vif_device_init(struct vif_device *v,
   struct net_device *dev,
@@ -94,5 +101,15 @@ static inline void vif_device_init(struct vif_device *v,
   unsigned short get_iflink_mask)
 {
 }
+
+static inline struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+  const struct rhashtable_params *rht_params,
+  void (*expire_func)(struct timer_list *t),
+  void (*table_set)(struct mr_table *mrt,
+struct net *net))
+{
+   return NULL;
+}
 #endif
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a1bf002..f213933 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -352,6 +352,14 @@ static const struct rhashtable_params ipmr_rht_params = {
.automatic_shrinking = true,
 };
 
+static void ipmr_new_table_set(struct mr_table *mrt,
+  struct net *net)
+{
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+   list_add_tail_rcu(>list, >ipv4.mr_tables);
+#endif
+}
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -364,23 +372,8 @@ static struct mr_table *ipmr_new_table(struct net *net, 
u32 id)
if (mrt)
return mrt;
 
-   mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
-   if (!mrt)
-   return ERR_PTR(-ENOMEM);
-   write_pnet(>net, net);
-   mrt->id = id;
-
-   rhltable_init(>mfc_hash, _rht_params);
-   INIT_LIST_HEAD(>mfc_cache_list);
-   INIT_LIST_HEAD(>mfc_unres_queue);
-
-   timer_setup(>ipmr_expire_timer, ipmr_expire_process, 0);
-
-   mrt->mroute_reg_vif_num = -1;
-#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
-   list_add_tail_rcu(>list, >ipv4.mr_tables);
-#endif
-   return mrt;
+   return mr_table_alloc(net, id, _rht_params,
+ ipmr_expire_process, ipmr_new_table_set);
 }
 
 static void ipmr_free_table(struct mr_table *mrt)
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 22758f8..3e21a58 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -26,3 +26,30 @@ void vif_device_init(struct vif_device *v,
v->link = dev->ifindex;
 }
 EXPORT_SYMBOL(vif_device_init);
+
+struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+  const struct rhashtable_params *rht_params,
+  void (*expire_func)(struct timer_list *t),
+  void (*table_set)(struct mr_table *mrt,
+struct net *net))
+{
+   struct mr_table *mrt;
+
+   mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+   if (!mrt)
+   return NULL;
+   mrt->id = id;
+   write_pnet(>net, net);
+
+   rhltable_init(>mfc_hash, rht_params);
+   INIT_LIST_HEAD(>mfc_cache_list);
+   INIT_LIST_HEAD(>mfc_unres_queue);
+
+   timer_setup(>ipmr_expire_timer, expire_func, 0);
+
+   mrt->mroute_reg_vif_num = -1;
+   table_set(mrt, net);
+   return mrt;
+}
+EXPORT_SYMBOL(mr_table_alloc);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index adbb826..d508528 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -31,7 +31,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -295,6 +294,14 @@ static const struct rhashtable_params ip6mr_rht_params = {
.automatic_shrinking = true,
 };
 
+static void ip6mr_new_table_set(struct mr_table *mrt,
+   struct net *net)
+{
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+   list_add_tail_rcu(>list, >ipv6.mr6_tables);
+#endif
+}
+
 static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 {
struct mr_t

[PATCH v2 net-next 08/11] ipmr, ip6mr: Unite mfc seq logic

2018-02-28 Thread Yuval Mintz

With the exception of the final dump, ipmr and ip6mr have the exact same
seq logic for traversing a given mr_table. Refactor that code and make
it common.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
Acked-by: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
---
 include/linux/mroute_base.h | 69 
 net/ipv4/ipmr.c | 93 +++
 net/ipv4/ipmr_base.c| 62 +
 net/ipv6/ip6mr.c| 97 -
 4 files changed, 143 insertions(+), 178 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 46a082e..a007c5a 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -3,6 +3,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -203,4 +204,72 @@ static inline void *mr_mfc_find(struct mr_table *mrt, void 
*hasharg)
 {
return mr_mfc_find_parent(mrt, hasharg, -1);
 }
+
+#ifdef CONFIG_PROC_FS
+struct mr_mfc_iter {
+   struct seq_net_private p;
+   struct mr_table *mrt;
+   struct list_head *cache;
+
+   /* Lock protecting the mr_table's unresolved queue */
+   spinlock_t *lock;
+};
+
+#ifdef CONFIG_IP_MROUTE_COMMON
+/* These actually return 'struct mr_mfc *', but to avoid need for explicit
+ * castings they simply return void.
+ */
+void *mr_mfc_seq_idx(struct net *net,
+struct mr_mfc_iter *it, loff_t pos);
+void *mr_mfc_seq_next(struct seq_file *seq, void *v,
+ loff_t *pos);
+
+static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos,
+struct mr_table *mrt, spinlock_t *lock)
+{
+   struct mr_mfc_iter *it = seq->private;
+
+   it->mrt = mrt;
+   it->cache = NULL;
+   it->lock = lock;
+
+   return *pos ? mr_mfc_seq_idx(seq_file_net(seq),
+seq->private, *pos - 1)
+   : SEQ_START_TOKEN;
+}
+
+static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+   struct mr_mfc_iter *it = seq->private;
+   struct mr_table *mrt = it->mrt;
+
+   if (it->cache == >mfc_unres_queue)
+   spin_unlock_bh(it->lock);
+   else if (it->cache == >mfc_cache_list)
+   rcu_read_unlock();
+}
+#else
+static inline void *mr_mfc_seq_idx(struct net *net,
+  struct mr_mfc_iter *it, loff_t pos)
+{
+   return NULL;
+}
+
+static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v,
+   loff_t *pos)
+{
+   return NULL;
+}
+
+static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos,
+struct mr_table *mrt, spinlock_t *lock)
+{
+   return NULL;
+}
+
+static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+}
+#endif
+#endif
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 00898c3..1eb19d9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -3014,41 +3014,8 @@ static const struct file_operations ipmr_vif_fops = {
.release = seq_release_net,
 };
 
-struct ipmr_mfc_iter {
-   struct seq_net_private p;
-   struct mr_table *mrt;
-   struct list_head *cache;
-};
-
-static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
- struct ipmr_mfc_iter *it, loff_t pos)
-{
-   struct mr_table *mrt = it->mrt;
-   struct mr_mfc *mfc;
-
-   rcu_read_lock();
-   it->cache = >mfc_cache_list;
-   list_for_each_entry_rcu(mfc, >mfc_cache_list, list)
-   if (pos-- == 0)
-   return (struct mfc_cache *)mfc;
-   rcu_read_unlock();
-
-   spin_lock_bh(_unres_lock);
-   it->cache = >mfc_unres_queue;
-   list_for_each_entry(mfc, it->cache, list)
-   if (pos-- == 0)
-   return (struct mfc_cache *)mfc;
-
-   spin_unlock_bh(_unres_lock);
-
-   it->cache = NULL;
-   return NULL;
-}
-
-
 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 {
-   struct ipmr_mfc_iter *it = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt;
 
@@ -3056,57 +3023,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, 
loff_t *pos)
if (!mrt)
return ERR_PTR(-ENOENT);
 
-   it->mrt = mrt;
-   it->cache = NULL;
-   return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
-   : SEQ_START_TOKEN;
-}
-
-static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-   struct ipmr_mfc_iter *it = seq->private;
-   struct net *net = seq_file_net(seq);
-   struct mr_table *mrt = it->mrt;
-   struct mfc_cache *mfc = v;
-
-   ++*pos;
-
-   if (v == SEQ_START_TOKEN)
-   r

[PATCH v2 net-next 00/11] ipmr, ip6mr: Align multicast routing for IPv4 & IPv6

2018-02-28 Thread Yuval Mintz

Historically, ip6mr was based [cut-n-paste] on ipmr and the two have not
diverged too much. Apparently, as ipv4 multicast routing is more common
than its ipv6 counterpart, modifications since then are mostly one-way,
affecting ipmr while leaving ip6mr unchanged.

This series is meant to re-factor both ipmr and ip6mr into having common
structures [and some functionality], adding 2 new common files -
mroute_base.h and ipmr_base.c.

The series begins by bringing ip6mr up to speed to some of the changes
applied in the past to ipmr [#2, #3].
It is then possible to re-factor a lot of the common structures -
vif devices [#1], mr_table [#4], mfc_cache [#6] - and use the common
structures in both ipmr and ip6mr.

The rest of the patches re-factor some choice flows used by both ipmr
and ip6mr and eliminate duplication.

This series would later allow for easy extension of ipmr offloading
to support ip6mr offloading as well, as almost all structures
related to the offloading would be shared between the two protocols.

Changes from previous versions
--
v2:
  - #6 Corrected reporting logic when hitting an unresolved cache
  - #7 Addressed kernel doc style [Thanks Nikolay]

RFC -> v1:
  - Corrected support for CONFIG_IP{,V6}_MROUTE_MULTIPLE_TABLES
  - Addressed a couple of kbuild test robot issues

Yuval Mintz (11):
  ipmr,ipmr6: Define a uniform vif_device
  ip6mr: Make mroute_sk rcu-based
  ip6mr: Align hash implementation to ipmr
  mroute*: Make mr_table a common struct
  ipmr, ip6mr: Unite creation of new mr_table
  ipmr, ip6mr: Make mfc_cache a common structure
  ipmr, ip6mr: Unite logic for searching in MFC cache
  ipmr, ip6mr: Unite mfc seq logic
  ipmr, ip6mr: Unite vif seq functions
  ip6mr: Remove MFC_NOTIFY and refactor flags
  ipmr, ip6mr: Unite dumproute flows

 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c |  21 +-
 include/linux/mroute.h|  88 +-
 include/linux/mroute6.h   |  62 +-
 include/linux/mroute_base.h   | 346 
 include/net/netns/ipv6.h  |   2 +-
 net/ipv4/Kconfig  |   5 +
 net/ipv4/Makefile |   1 +
 net/ipv4/ipmr.c   | 582 -
 net/ipv4/ipmr_base.c  | 323 +++
 net/ipv6/Kconfig  |   1 +
 net/ipv6/ip6_output.c |   2 +-
 net/ipv6/ip6mr.c  | 988 --
 12 files changed, 1249 insertions(+), 1172 deletions(-)
 create mode 100644 include/linux/mroute_base.h
 create mode 100644 net/ipv4/ipmr_base.c

-- 
2.4.3

[PATCH v2 net-next 06/11] ipmr, ip6mr: Make mfc_cache a common structure

2018-02-28 Thread Yuval Mintz

mfc_cache and mfc6_cache are almost identical - the main difference is
in the origin/group addresses and comparison-key. Make a common
structure encapsulating most of the multicast routing logic - mr_mfc -
and convert both ipmr and ip6mr into using it.

For easy conversion [casting, in this case] mr_mfc has to be the first
field inside every multicast routing abstraction utilizing it.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c |  21 +-
 include/linux/mroute.h|  45 +---
 include/linux/mroute6.h   |  23 +-
 include/linux/mroute_base.h   |  45 
 net/ipv4/ipmr.c   | 233 ++--
 net/ipv6/ip6mr.c  | 248 +++---
 6 files changed, 312 insertions(+), 303 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
index d20b143..978a3c7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
@@ -126,8 +126,8 @@ mlxsw_sp_mr_route_ivif_in_evifs(const struct 
mlxsw_sp_mr_route *mr_route)
 
switch (mr_route->mr_table->proto) {
case MLXSW_SP_L3_PROTO_IPV4:
-   ivif = mr_route->mfc4->mfc_parent;
-   return mr_route->mfc4->mfc_un.res.ttls[ivif] != 255;
+   ivif = mr_route->mfc4->_c.mfc_parent;
+   return mr_route->mfc4->_c.mfc_un.res.ttls[ivif] != 255;
case MLXSW_SP_L3_PROTO_IPV6:
/* fall through */
default:
@@ -364,7 +364,7 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table 
*mr_table,
mr_route->mfc4 = mfc;
mr_route->mr_table = mr_table;
for (i = 0; i < MAXVIFS; i++) {
-   if (mfc->mfc_un.res.ttls[i] != 255) {
+   if (mfc->_c.mfc_un.res.ttls[i] != 255) {
err = mlxsw_sp_mr_route_evif_link(mr_route,
  _table->vifs[i]);
if (err)
@@ -374,7 +374,8 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table 
*mr_table,
mr_route->min_mtu = mr_table->vifs[i].dev->mtu;
}
}
-   mlxsw_sp_mr_route_ivif_link(mr_route, _table->vifs[mfc->mfc_parent]);
+   mlxsw_sp_mr_route_ivif_link(mr_route,
+   _table->vifs[mfc->_c.mfc_parent]);
 
mr_route->route_action = mlxsw_sp_mr_route_action(mr_route);
return mr_route;
@@ -418,9 +419,9 @@ static void mlxsw_sp_mr_mfc_offload_set(struct 
mlxsw_sp_mr_route *mr_route,
switch (mr_route->mr_table->proto) {
case MLXSW_SP_L3_PROTO_IPV4:
if (offload)
-   mr_route->mfc4->mfc_flags |= MFC_OFFLOAD;
+   mr_route->mfc4->_c.mfc_flags |= MFC_OFFLOAD;
else
-   mr_route->mfc4->mfc_flags &= ~MFC_OFFLOAD;
+   mr_route->mfc4->_c.mfc_flags &= ~MFC_OFFLOAD;
break;
case MLXSW_SP_L3_PROTO_IPV6:
/* fall through */
@@ -943,10 +944,10 @@ static void mlxsw_sp_mr_route_stats_update(struct 
mlxsw_sp *mlxsw_sp,
 
switch (mr_route->mr_table->proto) {
case MLXSW_SP_L3_PROTO_IPV4:
-   if (mr_route->mfc4->mfc_un.res.pkt != packets)
-   mr_route->mfc4->mfc_un.res.lastuse = jiffies;
-   mr_route->mfc4->mfc_un.res.pkt = packets;
-   mr_route->mfc4->mfc_un.res.bytes = bytes;
+   if (mr_route->mfc4->_c.mfc_un.res.pkt != packets)
+   mr_route->mfc4->_c.mfc_un.res.lastuse = jiffies;
+   mr_route->mfc4->_c.mfc_un.res.pkt = packets;
+   mr_route->mfc4->_c.mfc_un.res.bytes = bytes;
break;
case MLXSW_SP_L3_PROTO_IPV6:
/* fall through */
diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 8688c5d..63b36e6 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -81,28 +81,13 @@ struct mfc_cache_cmp_arg {
 
 /**
  * struct mfc_cache - multicast routing entries
- * @mnode: rhashtable list
+ * @_c: Common multicast routing information; has to be first [for casting]
  * @mfc_mcastgrp: destination multicast group address
  * @mfc_origin: source address
  * @cmparg: used for rhashtable comparisons
- * @mfc_parent: source interface (iif)
- * @mfc_flags: entry flags
- * @expires: unresolved entry expire time
- * @unresolved: unresolved cached skbs
- * @last_assert: time of last assert
- * @minvif: minimum VIF id
- * @maxvif: maximum VIF id
- * @bytes: bytes that have passed for this entry
- * @pkt: packets that have pas

[PATCH v2 net-next 04/11] mroute*: Make mr_table a common struct

2018-02-28 Thread Yuval Mintz

Following previous changes to ip6mr, mr_table and mr6_table are
basically the same [up to mr6_table having additional '6' suffixes to
its variable names].
Move the common structure definition into a common header; this
requires renaming all references in ip6mr to variables that had the
distinct suffix.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
Acked-by: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
---
 include/linux/mroute.h  |  21 
 include/linux/mroute6.h |   1 -
 include/linux/mroute_base.h |  46 +++
 include/net/netns/ipv6.h|   2 +-
 net/ipv4/ipmr.c |   2 -
 net/ipv6/ip6mr.c| 301 
 6 files changed, 186 insertions(+), 187 deletions(-)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index b8aadff..8688c5d 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -4,8 +4,6 @@
 
 #include 
 #include 
-#include 
-#include 
 #include 
 #include 
 #include 
@@ -67,25 +65,6 @@ struct vif_entry_notifier_info {
 
 #define VIFF_STATIC 0x8000
 
-#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
-
-struct mr_table {
-   struct list_headlist;
-   possible_net_t  net;
-   u32 id;
-   struct sock __rcu   *mroute_sk;
-   struct timer_list   ipmr_expire_timer;
-   struct list_headmfc_unres_queue;
-   struct vif_device   vif_table[MAXVIFS];
-   struct rhltable mfc_hash;
-   struct list_headmfc_cache_list;
-   int maxvif;
-   atomic_tcache_resolve_queue_len;
-   boolmroute_do_assert;
-   boolmroute_do_pim;
-   int mroute_reg_vif_num;
-};
-
 /* mfc_flags:
  * MFC_STATIC - the entry was added statically (not by a routing daemon)
  * MFC_OFFLOAD - the entry was offloaded to the hardware
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index e2dac19..d5c8dc1 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -8,7 +8,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline int ip6_mroute_opt(int opt)
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 0de651e..1cc944a 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -2,6 +2,9 @@
 #define __LINUX_MROUTE_BASE_H
 
 #include 
+#include 
+#include 
+#include 
 
 /**
  * struct vif_device - interface representor for multicast routing
@@ -32,6 +35,49 @@ struct vif_device {
__be32 local, remote;
 };
 
+#ifndef MAXVIFS
+/* This one is nasty; value is defined in uapi using different symbols for
+ * mroute and mroute6 but both map into same 32.
+ */
+#define MAXVIFS32
+#endif
+
+#define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev))
+
+/**
+ * struct mr_table - a multicast routing table
+ * @list: entry within a list of multicast routing tables
+ * @net: net where this table belongs
+ * @id: identifier of the table
+ * @mroute_sk: socket associated with the table
+ * @ipmr_expire_timer: timer for handling unresolved routes
+ * @mfc_unres_queue: list of unresolved MFC entries
+ * @vif_table: array containing all possible vifs
+ * @mfc_hash: Hash table of all resolved routes for easy lookup
+ * @mfc_cache_list: list of resolved routes for possible traversal
+ * @maxvif: Identifier of highest value vif currently in use
+ * @cache_resolve_queue_len: current size of unresolved queue
+ * @mroute_do_assert: Whether to inform userspace on wrong ingress
+ * @mroute_do_pim: Whether to receive IGMP PIMv1
+ * @mroute_reg_vif_num: PIM-device vif index
+ */
+struct mr_table {
+   struct list_headlist;
+   possible_net_t  net;
+   u32 id;
+   struct sock __rcu   *mroute_sk;
+   struct timer_list   ipmr_expire_timer;
+   struct list_headmfc_unres_queue;
+   struct vif_device   vif_table[MAXVIFS];
+   struct rhltable mfc_hash;
+   struct list_headmfc_cache_list;
+   int maxvif;
+   atomic_tcache_resolve_queue_len;
+   boolmroute_do_assert;
+   boolmroute_do_pim;
+   int mroute_reg_vif_num;
+};
+
 #ifdef CONFIG_IP_MROUTE_COMMON
 void vif_device_init(struct vif_device *v,
 struct net_device *dev,
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 987cc45..0d177fa9 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -84,7 +84,7 @@ struct netns_ipv6 {
struct sock *mc_autojoin_sk;
 #ifdef CONFIG_IPV6_MROUTE
 #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
-   struct mr6_table*mrt6;
+   struct mr_table *mrt6;
 #else
struct list_headmr6_tabl

[PATCH v2 net-next 03/11] ip6mr: Align hash implementation to ipmr

2018-02-28 Thread Yuval Mintz

Since commit 8fb472c09b9d ("ipmr: improve hash scalability") ipmr has
been using rhashtable as a basis for its mfc routes, but ip6mr is
currently still using the old private MFC hash implementation.

Align ip6mr to the current ipmr implementation.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
Acked-by: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
---
 include/linux/mroute6.h |  30 ++---
 net/ipv6/ip6mr.c| 313 ++--
 2 files changed, 184 insertions(+), 159 deletions(-)

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index e1b9fb0..e2dac19 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline int ip6_mroute_opt(int opt)
@@ -65,10 +66,20 @@ static inline void ip6_mr_cleanup(void)
 
 #define VIFF_STATIC 0x8000
 
+struct mfc6_cache_cmp_arg {
+   struct in6_addr mf6c_mcastgrp;
+   struct in6_addr mf6c_origin;
+};
+
 struct mfc6_cache {
-   struct list_head list;
-   struct in6_addr mf6c_mcastgrp;  /* Group the entry 
belongs to   */
-   struct in6_addr mf6c_origin;/* Source of packet 
*/
+   struct rhlist_head mnode;
+   union {
+   struct {
+   struct in6_addr mf6c_mcastgrp;
+   struct in6_addr mf6c_origin;
+   };
+   struct mfc6_cache_cmp_arg cmparg;
+   };
mifi_t mf6c_parent; /* Source interface 
*/
int mfc_flags;  /* Flags on line
*/
 
@@ -88,22 +99,13 @@ struct mfc6_cache {
unsigned char ttls[MAXMIFS];/* TTL thresholds   
*/
} res;
} mfc_un;
+   struct list_head list;
+   struct rcu_head rcu;
 };
 
 #define MFC_STATIC 1
 #define MFC_NOTIFY 2
 
-#define MFC6_LINES 64
-
-#define MFC6_HASH(a, g) (((__force u32)(a)->s6_addr32[0] ^ \
- (__force u32)(a)->s6_addr32[1] ^ \
- (__force u32)(a)->s6_addr32[2] ^ \
- (__force u32)(a)->s6_addr32[3] ^ \
- (__force u32)(g)->s6_addr32[0] ^ \
- (__force u32)(g)->s6_addr32[1] ^ \
- (__force u32)(g)->s6_addr32[2] ^ \
- (__force u32)(g)->s6_addr32[3]) % MFC6_LINES)
-
 #define MFC_ASSERT_THRESH (3*HZ)   /* Maximal freq. of asserts */
 
 struct rtmsg;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a0e297d..6f0b7f4 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -61,8 +61,9 @@ struct mr6_table {
struct sock __rcu   *mroute6_sk;
struct timer_list   ipmr_expire_timer;
struct list_headmfc6_unres_queue;
-   struct list_headmfc6_cache_array[MFC6_LINES];
struct vif_device   vif6_table[MAXMIFS];
+   struct rhltable mfc6_hash;
+   struct list_headmfc6_cache_list;
int maxvif;
atomic_tcache_resolve_queue_len;
boolmroute_do_assert;
@@ -299,10 +300,29 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 }
 #endif
 
+static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+   const struct mfc6_cache_cmp_arg *cmparg = arg->key;
+   struct mfc6_cache *c = (struct mfc6_cache *)ptr;
+
+   return !ipv6_addr_equal(>mf6c_mcastgrp, >mf6c_mcastgrp) ||
+  !ipv6_addr_equal(>mf6c_origin, >mf6c_origin);
+}
+
+static const struct rhashtable_params ip6mr_rht_params = {
+   .head_offset = offsetof(struct mfc6_cache, mnode),
+   .key_offset = offsetof(struct mfc6_cache, cmparg),
+   .key_len = sizeof(struct mfc6_cache_cmp_arg),
+   .nelem_hint = 3,
+   .locks_mul = 1,
+   .obj_cmpfn = ip6mr_hash_cmp,
+   .automatic_shrinking = true,
+};
+
 static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
 {
struct mr6_table *mrt;
-   unsigned int i;
 
mrt = ip6mr_get_table(net, id);
if (mrt)
@@ -314,10 +334,8 @@ static struct mr6_table *ip6mr_new_table(struct net *net, 
u32 id)
mrt->id = id;
write_pnet(>net, net);
 
-   /* Forwarding cache */
-   for (i = 0; i < MFC6_LINES; i++)
-   INIT_LIST_HEAD(>mfc6_cache_array[i]);
-
+   rhltable_init(>mfc6_hash, _rht_params);
+   INIT_LIST_HEAD(>mfc6_cache_list);
INIT_LIST_HEAD(>mfc6_unres_queue);
 
timer_setup(>ipmr_expire_timer, ipmr_expire_process, 0);
@@ -335,6 +353,7 @@ static void ip6mr_free_table(struct mr6_table *mrt)
 {
del_timer_sync(>ipmr_expire_timer);
mroute_clean_tab

[PATCH v2 net-next 10/11] ip6mr: Remove MFC_NOTIFY and refactor flags

2018-02-28 Thread Yuval Mintz

MFC_NOTIFY exists in ip6mr, probably as some legacy code
[it was already removed for ipmr in commit
06bd6c0370bb ("net: ipmr: remove unused MFC_NOTIFY flag and make the flags 
enum")].
Remove it from ip6mr as well, and move the enum into a common file;
Notice MFC_OFFLOAD is currently only used by ipmr.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
Acked-by: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
---
 include/linux/mroute.h  | 9 -
 include/linux/mroute6.h | 3 ---
 include/linux/mroute_base.h | 9 +
 net/ipv6/ip6mr.c| 3 ---
 4 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 63b36e6..7ed82e4 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -65,15 +65,6 @@ struct vif_entry_notifier_info {
 
 #define VIFF_STATIC 0x8000
 
-/* mfc_flags:
- * MFC_STATIC - the entry was added statically (not by a routing daemon)
- * MFC_OFFLOAD - the entry was offloaded to the hardware
- */
-enum {
-   MFC_STATIC = BIT(0),
-   MFC_OFFLOAD = BIT(1),
-};
-
 struct mfc_cache_cmp_arg {
__be32 mfc_mcastgrp;
__be32 mfc_origin;
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 6acf576..1ac38e6 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -81,9 +81,6 @@ struct mfc6_cache {
};
 };
 
-#define MFC_STATIC 1
-#define MFC_NOTIFY 2
-
 #define MFC_ASSERT_THRESH (3*HZ)   /* Maximal freq. of asserts */
 
 struct rtmsg;
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index cfaec9b..f40202b 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -45,6 +45,15 @@ struct vif_device {
 
 #define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev))
 
+/* mfc_flags:
+ * MFC_STATIC - the entry was added statically (not by a routing daemon)
+ * MFC_OFFLOAD - the entry was offloaded to the hardware
+ */
+enum {
+   MFC_STATIC = BIT(0),
+   MFC_OFFLOAD = BIT(1),
+};
+
 /**
  * struct mr_mfc - common multicast routing entries
  * @mnode: rhashtable list
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index ddd9e6b..c3b3f1c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2203,9 +2203,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, 
struct rtmsg *rtm,
return err;
}
 
-   if (rtm->rtm_flags & RTM_F_NOTIFY)
-   cache->_c.mfc_flags |= MFC_NOTIFY;
-
err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
read_unlock(_lock);
return err;
-- 
2.4.3

[PATCH v2 net-next 09/11] ipmr, ip6mr: Unite vif seq functions

2018-02-28 Thread Yuval Mintz

Same as previously done with the mfc seq, the logic for the vif seq is
refactored to be shared between ipmr and ip6mr.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
Acked-by: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
---
 include/linux/mroute_base.h | 33 ++
 net/ipv4/ipmr.c | 49 +---
 net/ipv4/ipmr_base.c| 33 ++
 net/ipv6/ip6mr.c| 50 +
 4 files changed, 76 insertions(+), 89 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index a007c5a..cfaec9b 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -206,6 +206,12 @@ static inline void *mr_mfc_find(struct mr_table *mrt, void 
*hasharg)
 }
 
 #ifdef CONFIG_PROC_FS
+struct mr_vif_iter {
+   struct seq_net_private p;
+   struct mr_table *mrt;
+   int ct;
+};
+
 struct mr_mfc_iter {
struct seq_net_private p;
struct mr_table *mrt;
@@ -216,6 +222,16 @@ struct mr_mfc_iter {
 };
 
 #ifdef CONFIG_IP_MROUTE_COMMON
+void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos);
+void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos);
+
+static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+{
+   return *pos ? mr_vif_seq_idx(seq_file_net(seq),
+seq->private, *pos - 1)
+   : SEQ_START_TOKEN;
+}
+
 /* These actually return 'struct mr_mfc *', but to avoid need for explicit
  * castings they simply return void.
  */
@@ -249,6 +265,23 @@ static inline void mr_mfc_seq_stop(struct seq_file *seq, 
void *v)
rcu_read_unlock();
 }
 #else
+static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter,
+  loff_t pos)
+{
+   return NULL;
+}
+
+static inline void *mr_vif_seq_next(struct seq_file *seq,
+   void *v, loff_t *pos)
+{
+   return NULL;
+}
+
+static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+{
+   return NULL;
+}
+
 static inline void *mr_mfc_seq_idx(struct net *net,
   struct mr_mfc_iter *it, loff_t pos)
 {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 1eb19d9..f5ff542 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2908,31 +2908,11 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, 
struct netlink_callback *cb)
 /* The /proc interfaces to multicast routing :
  * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
  */
-struct ipmr_vif_iter {
-   struct seq_net_private p;
-   struct mr_table *mrt;
-   int ct;
-};
-
-static struct vif_device *ipmr_vif_seq_idx(struct net *net,
-   struct ipmr_vif_iter *iter,
-   loff_t pos)
-{
-   struct mr_table *mrt = iter->mrt;
-
-   for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
-   if (!VIF_EXISTS(mrt, iter->ct))
-   continue;
-   if (pos-- == 0)
-   return >vif_table[iter->ct];
-   }
-   return NULL;
-}
 
 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(mrt_lock)
 {
-   struct ipmr_vif_iter *iter = seq->private;
+   struct mr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt;
 
@@ -2943,26 +2923,7 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, 
loff_t *pos)
iter->mrt = mrt;
 
read_lock(_lock);
-   return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
-   : SEQ_START_TOKEN;
-}
-
-static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-   struct ipmr_vif_iter *iter = seq->private;
-   struct net *net = seq_file_net(seq);
-   struct mr_table *mrt = iter->mrt;
-
-   ++*pos;
-   if (v == SEQ_START_TOKEN)
-   return ipmr_vif_seq_idx(net, iter, 0);
-
-   while (++iter->ct < mrt->maxvif) {
-   if (!VIF_EXISTS(mrt, iter->ct))
-   continue;
-   return >vif_table[iter->ct];
-   }
-   return NULL;
+   return mr_vif_seq_start(seq, pos);
 }
 
 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
@@ -2973,7 +2934,7 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void 
*v)
 
 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 {
-   struct ipmr_vif_iter *iter = seq->private;
+   struct mr_vif_iter *iter = seq->private;
struct mr_table *mrt = iter->mrt;
 
if (v == SEQ_START_TOKEN) {
@@ -2996,7 +2957,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void 
*v)
 
 static const struct seq_operations ipmr_vif_seq_ops = {
.start

[PATCH v2 net-next 07/11] ipmr, ip6mr: Unite logic for searching in MFC cache

2018-02-28 Thread Yuval Mintz

ipmr and ip6mr utilize the exact same methods for searching the
hashed resolved connections, difference being only in the construction
of the hash comparison key.

In order to unite the flow, introduce an mr_table operation set that
would contain the protocol specific information required for common
flows, in this case - the hash parameters and a comparison key
representing a (*,*) route.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h | 52 +--
 net/ipv4/ipmr.c | 71 ++-
 net/ipv4/ipmr_base.c| 54 +++--
 net/ipv6/ip6mr.c| 74 +++--
 4 files changed, 134 insertions(+), 117 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 2769e2f..46a082e 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -89,10 +89,23 @@ struct mr_mfc {
struct rcu_head rcu;
 };
 
+struct mr_table;
+
+/**
+ * struct mr_table_ops - callbacks and info for protocol-specific ops
+ * @rht_params: parameters for accessing the MFC hash
+ * @cmparg_any: a hash key to be used for matching on (*,*) routes
+ */
+struct mr_table_ops {
+   const struct rhashtable_params *rht_params;
+   void *cmparg_any;
+};
+
 /**
  * struct mr_table - a multicast routing table
  * @list: entry within a list of multicast routing tables
  * @net: net where this table belongs
+ * @ops: protocol specific operations
  * @id: identifier of the table
  * @mroute_sk: socket associated with the table
  * @ipmr_expire_timer: timer for handling unresolved routes
@@ -109,6 +122,7 @@ struct mr_mfc {
 struct mr_table {
struct list_headlist;
possible_net_t  net;
+   struct mr_table_ops ops;
u32 id;
struct sock __rcu   *mroute_sk;
struct timer_list   ipmr_expire_timer;
@@ -133,10 +147,19 @@ void vif_device_init(struct vif_device *v,
 
 struct mr_table *
 mr_table_alloc(struct net *net, u32 id,
-  const struct rhashtable_params *rht_params,
+  struct mr_table_ops *ops,
   void (*expire_func)(struct timer_list *t),
   void (*table_set)(struct mr_table *mrt,
 struct net *net));
+
+/* These actually return 'struct mr_mfc *', but to avoid need for explicit
+ * castings they simply return void.
+ */
+void *mr_mfc_find_parent(struct mr_table *mrt,
+void *hasharg, int parent);
+void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi);
+void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg);
+
 #else
 static inline void vif_device_init(struct vif_device *v,
   struct net_device *dev,
@@ -147,14 +170,37 @@ static inline void vif_device_init(struct vif_device *v,
 {
 }
 
-static inline struct mr_table *
+static inline void *
 mr_table_alloc(struct net *net, u32 id,
-  const struct rhashtable_params *rht_params,
+  struct mr_table_ops *ops,
   void (*expire_func)(struct timer_list *t),
   void (*table_set)(struct mr_table *mrt,
 struct net *net))
 {
return NULL;
 }
+
+static inline void *mr_mfc_find_parent(struct mr_table *mrt,
+  void *hasharg, int parent)
+{
+   return NULL;
+}
+
+static inline void *mr_mfc_find_any_parent(struct mr_table *mrt,
+  int vifi)
+{
+   return NULL;
+}
+
+static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt,
+int vifi, void *hasharg)
+{
+   return NULL;
+}
 #endif
+
+static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg)
+{
+   return mr_mfc_find_parent(mrt, hasharg, -1);
+}
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3f75150..00898c3 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -360,6 +360,16 @@ static void ipmr_new_table_set(struct mr_table *mrt,
 #endif
 }
 
+static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = {
+   .mfc_mcastgrp = htonl(INADDR_ANY),
+   .mfc_origin = htonl(INADDR_ANY),
+};
+
+static struct mr_table_ops ipmr_mr_table_ops = {
+   .rht_params = _rht_params,
+   .cmparg_any = _mr_table_ops_cmparg_any,
+};
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -372,7 +382,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 
id)
if (mrt)
return mrt;
 
-   return mr_table_alloc(net, id, _rht_params,
+   return mr_table_alloc(net, id, _mr_table_ops,
  ipmr_expire_process, ipmr_new_table_set);
 }
 
@@ -973,33 +983,8 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table

[PATCH v2 net-next 02/11] ip6mr: Make mroute_sk rcu-based

2018-02-28 Thread Yuval Mintz

In ipmr the mr_table socket is handled under RCU. Introduce the same
for ip6mr.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
Acked-by: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
---
 include/linux/mroute6.h |  6 +++---
 net/ipv6/ip6_output.c   |  2 +-
 net/ipv6/ip6mr.c| 45 +++--
 3 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index e5e5b82..e1b9fb0 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -111,12 +111,12 @@ extern int ip6mr_get_route(struct net *net, struct 
sk_buff *skb,
   struct rtmsg *rtm, u32 portid);
 
 #ifdef CONFIG_IPV6_MROUTE
-extern struct sock *mroute6_socket(struct net *net, struct sk_buff *skb);
+bool mroute6_is_socket(struct net *net, struct sk_buff *skb);
 extern int ip6mr_sk_done(struct sock *sk);
 #else
-static inline struct sock *mroute6_socket(struct net *net, struct sk_buff *skb)
+static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
 {
-   return NULL;
+   return false;
 }
 static inline int ip6mr_sk_done(struct sock *sk)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 997c7f1..a6eb0e6 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -71,7 +71,7 @@ static int ip6_finish_output2(struct net *net, struct sock 
*sk, struct sk_buff *
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 
if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
-   ((mroute6_socket(net, skb) &&
+   ((mroute6_is_socket(net, skb) &&
 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 ipv6_chk_mcast_addr(dev, _hdr(skb)->daddr,
 _hdr(skb)->saddr))) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index e397990..a0e297d 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -58,7 +58,7 @@ struct mr6_table {
struct list_headlist;
possible_net_t  net;
u32 id;
-   struct sock *mroute6_sk;
+   struct sock __rcu   *mroute6_sk;
struct timer_list   ipmr_expire_timer;
struct list_headmfc6_unres_queue;
struct list_headmfc6_cache_array[MFC6_LINES];
@@ -1121,6 +1121,7 @@ static void ip6mr_cache_resolve(struct net *net, struct 
mr6_table *mrt,
 static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
  mifi_t mifi, int assert)
 {
+   struct sock *mroute6_sk;
struct sk_buff *skb;
struct mrt6msg *msg;
int ret;
@@ -1190,17 +1191,19 @@ static int ip6mr_cache_report(struct mr6_table *mrt, 
struct sk_buff *pkt,
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
 
-   if (!mrt->mroute6_sk) {
+   rcu_read_lock();
+   mroute6_sk = rcu_dereference(mrt->mroute6_sk);
+   if (!mroute6_sk) {
+   rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
 
mrt6msg_netlink_event(mrt, skb);
 
-   /*
-*  Deliver to user space multicast routing algorithms
-*/
-   ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb);
+   /* Deliver to user space multicast routing algorithms */
+   ret = sock_queue_rcv_skb(mroute6_sk, skb);
+   rcu_read_unlock();
if (ret < 0) {
net_warn_ratelimited("mroute6: pending queue full, dropping 
entries\n");
kfree_skb(skb);
@@ -1584,11 +1587,11 @@ static int ip6mr_sk_init(struct mr6_table *mrt, struct 
sock *sk)
 
rtnl_lock();
write_lock_bh(_lock);
-   if (likely(mrt->mroute6_sk == NULL)) {
-   mrt->mroute6_sk = sk;
-   net->ipv6.devconf_all->mc_forwarding++;
-   } else {
+   if (rtnl_dereference(mrt->mroute6_sk)) {
err = -EADDRINUSE;
+   } else {
+   rcu_assign_pointer(mrt->mroute6_sk, sk);
+   net->ipv6.devconf_all->mc_forwarding++;
}
write_unlock_bh(_lock);
 
@@ -1614,9 +1617,9 @@ int ip6mr_sk_done(struct sock *sk)
 
rtnl_lock();
ip6mr_for_each_table(mrt, net) {
-   if (sk == mrt->mroute6_sk) {
+   if (sk == rtnl_dereference(mrt->mroute6_sk)) {
write_lock_bh(_lock);
-   mrt->mroute6_sk = NULL;
+   RCU_INIT_POINTER(mrt->mroute6_sk, NULL);
net->ipv6.devconf_all->mc_forwarding--;
write_unlock_bh(_lock);
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -1630,11 +1633,12 @@ int ip6mr_sk_done(struct sock *sk)
}
}
rtnl_unlock();
+   synchronize_rcu(

[PATCH v2 net-next 01/11] ipmr,ipmr6: Define a uniform vif_device

2018-02-28 Thread Yuval Mintz

The two implementations have almost identical structures - vif_device and
mif_device. As a step toward unifying the mr_tables, eliminate the
mif_device and relocate the vif_device definition into a new common
header file.

Also, introduce a common initializing function for setting most of the
vif_device fields in a new common source file. This requires modifying
the ipv{4,6} Kconfig and ipv4 makefile as we're introducing a new common
config option - CONFIG_IP_MROUTE_COMMON.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
Acked-by: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
---
 include/linux/mroute.h  | 13 +---
 include/linux/mroute6.h | 11 +-
 include/linux/mroute_base.h | 52 +
 net/ipv4/Kconfig|  5 +
 net/ipv4/Makefile   |  1 +
 net/ipv4/ipmr.c | 32 +---
 net/ipv4/ipmr_base.c| 28 
 net/ipv6/Kconfig|  1 +
 net/ipv6/ip6mr.c| 37 
 9 files changed, 117 insertions(+), 63 deletions(-)
 create mode 100644 include/linux/mroute_base.h
 create mode 100644 net/ipv4/ipmr_base.c

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 5396521..b8aadff 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_IP_MROUTE
 static inline int ip_mroute_opt(int opt)
@@ -56,18 +57,6 @@ static inline bool ipmr_rule_default(const struct fib_rule 
*rule)
 }
 #endif
 
-struct vif_device {
-   struct net_device   *dev;   /* Device we are using 
*/
-   struct netdev_phys_item_id dev_parent_id;   /* Device parent ID
*/
-   unsigned long   bytes_in,bytes_out;
-   unsigned long   pkt_in,pkt_out; /* Statistics   
*/
-   unsigned long   rate_limit; /* Traffic shaping (NI) 
*/
-   unsigned char   threshold;  /* TTL threshold
*/
-   unsigned short  flags;  /* Control flags
*/
-   __be32  local,remote;   /* Addresses(remote for 
tunnels)*/
-   int link;   /* Physical interface index 
*/
-};
-
 struct vif_entry_notifier_info {
struct fib_notifier_info info;
struct net_device *dev;
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 3014c52..e5e5b82 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -7,6 +7,7 @@
 #include   /* for struct sk_buff_head */
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline int ip6_mroute_opt(int opt)
@@ -62,16 +63,6 @@ static inline void ip6_mr_cleanup(void)
 }
 #endif
 
-struct mif_device {
-   struct net_device   *dev;   /* Device we are using 
*/
-   unsigned long   bytes_in,bytes_out;
-   unsigned long   pkt_in,pkt_out; /* Statistics   
*/
-   unsigned long   rate_limit; /* Traffic shaping (NI) 
*/
-   unsigned char   threshold;  /* TTL threshold
*/
-   unsigned short  flags;  /* Control flags
*/
-   int link;   /* Physical interface index 
*/
-};
-
 #define VIFF_STATIC 0x8000
 
 struct mfc6_cache {
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
new file mode 100644
index 000..0de651e
--- /dev/null
+++ b/include/linux/mroute_base.h
@@ -0,0 +1,52 @@
+#ifndef __LINUX_MROUTE_BASE_H
+#define __LINUX_MROUTE_BASE_H
+
+#include 
+
+/**
+ * struct vif_device - interface representor for multicast routing
+ * @dev: network device being used
+ * @bytes_in: statistic; bytes ingressing
+ * @bytes_out: statistic; bytes egressing
+ * @pkt_in: statistic; packets ingressing
+ * @pkt_out: statistic; packets egressing
+ * @rate_limit: Traffic shaping (NI)
+ * @threshold: TTL threshold
+ * @flags: Control flags
+ * @link: Physical interface index
+ * @dev_parent_id: device parent id
+ * @local: Local address
+ * @remote: Remote address for tunnels
+ */
+struct vif_device {
+   struct net_device *dev;
+   unsigned long bytes_in, bytes_out;
+   unsigned long pkt_in, pkt_out;
+   unsigned long rate_limit;
+   unsigned char threshold;
+   unsigned short flags;
+   int link;
+
+   /* Currently only used by ipmr */
+   struct netdev_phys_item_id dev_parent_id;
+   __be32 local, remote;
+};
+
+#ifdef CONFIG_IP_MROUTE_COMMON
+void vif_device_init(struct vif_device *v,
+struct net_device *dev,
+unsigned long rate_limit,
+unsigned char threshold,
+unsigned short flags,
+unsigned short get_iflink_mask);
+#else
+static inline void vif_device_init(struct v

[PATCH v2 net-next 11/11] ipmr, ip6mr: Unite dumproute flows

2018-02-28 Thread Yuval Mintz

The various MFC entries are being held in the same kind of mr_tables
for both ipmr and ip6mr, and their traversal logic is identical.
Also, with the exception of the addresses [and other small tidbits]
the major bulk of the nla setting is identical.

Unite as much of the dumping as possible between the two.
Notice this requires creating an mr_table iterator for each, as the
for-each preprocessor macro can't be used by the common logic.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
Acked-by: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
---
 include/linux/mroute_base.h |  29 
 net/ipv4/ipmr.c | 161 +++-
 net/ipv4/ipmr_base.c| 123 +
 net/ipv6/ip6mr.c| 156 +++---
 4 files changed, 230 insertions(+), 239 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index f40202b..c2560cb 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -170,6 +170,16 @@ void *mr_mfc_find_parent(struct mr_table *mrt,
 void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi);
 void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg);
 
+int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+  struct mr_mfc *c, struct rtmsg *rtm);
+int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
+struct mr_table *(*iter)(struct net *net,
+ struct mr_table *mrt),
+int (*fill)(struct mr_table *mrt,
+struct sk_buff *skb,
+u32 portid, u32 seq, struct mr_mfc *c,
+int cmd, int flags),
+spinlock_t *lock);
 #else
 static inline void vif_device_init(struct vif_device *v,
   struct net_device *dev,
@@ -207,6 +217,25 @@ static inline struct mr_mfc *mr_mfc_find_any(struct 
mr_table *mrt,
 {
return NULL;
 }
+
+static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+struct mr_mfc *c, struct rtmsg *rtm)
+{
+   return -EINVAL;
+}
+
+static inline int
+mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
+struct mr_table *(*iter)(struct net *net,
+ struct mr_table *mrt),
+int (*fill)(struct mr_table *mrt,
+struct sk_buff *skb,
+u32 portid, u32 seq, struct mr_mfc *c,
+int cmd, int flags),
+spinlock_t *lock)
+{
+   return -EINVAL;
+}
 #endif
 
 static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index f5ff542..d752a70 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -105,8 +105,6 @@ static void ip_mr_forward(struct net *net, struct mr_table 
*mrt,
  struct mfc_cache *cache, int local);
 static int ipmr_cache_report(struct mr_table *mrt,
 struct sk_buff *pkt, vifi_t vifi, int assert);
-static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
- struct mr_mfc *c, struct rtmsg *rtm);
 static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
 int cmd);
 static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
@@ -117,6 +115,23 @@ static void ipmr_expire_process(struct timer_list *t);
 #define ipmr_for_each_table(mrt, net) \
list_for_each_entry_rcu(mrt, >ipv4.mr_tables, list)
 
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+  struct mr_table *mrt)
+{
+   struct mr_table *ret;
+
+   if (!mrt)
+   ret = list_entry_rcu(net->ipv4.mr_tables.next,
+struct mr_table, list);
+   else
+   ret = list_entry_rcu(mrt->list.next,
+struct mr_table, list);
+
+   if (>list == >ipv4.mr_tables)
+   return NULL;
+   return ret;
+}
+
 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -284,6 +299,14 @@ EXPORT_SYMBOL(ipmr_rule_default);
 #define ipmr_for_each_table(mrt, net) \
for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
 
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+  struct mr_table *mrt)
+{
+   if (!mrt)
+   return net->ipv4.mrt;
+   return NULL;
+}
+
 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 {
return net->ipv4.mrt;
@@ -1051,8 +1074,8 @@ static void ipmr_cache_resolve(struct net *net, struct 
mr_table *mrt,

Re: [PATCH net-next 06/11] ipmr, ip6mr: Make mfc_cache a common structure

2018-02-28 Thread Yuval Mintz

On Wed, Feb 28, 2018 at 12:38:20AM +0200, Nikolay Aleksandrov wrote:
> On 27/02/18 20:58, Yuval Mintz wrote:
> > mfc_cache and mfc6_cache are almost identical - the main difference is
> > in the origin/group addresses and comparison-key. Make a common
> > structure encapsulating most of the multicast routing logic  - mr_mfc
> > and convert both ipmr and ip6mr into using it.
> > 
> > For easy conversion [casting, in this case] mr_mfc has to be the first
> > field inside every multicast routing abstraction utilizing it.
> > 
> > Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
> > ---
> >  drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c |  21 +-
> >  include/linux/mroute.h|  45 +---
> >  include/linux/mroute6.h   |  23 +--
> >  include/linux/mroute_base.h   |  45 
> >  net/ipv4/ipmr.c   | 234 
> > +++--
> >  net/ipv6/ip6mr.c  | 241 
> > +++---
> >  6 files changed, 312 insertions(+), 297 deletions(-)
> > 
> 
> I feel uneasy about these casts all over the place, anyway functionally
> the patch looks fine.

Well, testing revealed a bug in handling unresolved cache entries;
I'll fix it as well in v2.

RE: [PATCH net-next] team: Use extack to report enslavement failures

2018-02-27 Thread Yuval Mintz

> >> > If so, for how long? They should certainly be removed eventually. How
> >> > do we ensure we don't forget?
> >> >
> >> > Seems to me it would be better to remove them right now.
> >>
> >> I can do that unless someone objects.
> >
> >I don't object, but FWIW keep in mind extack errors don't show if
> >libmnl is not installed..
> 
> Yeah, or if you have an older iproute2 package. I would keep the existing
> > dmesg msgs for now. In the future, when everyone is used to extacks, then
> we can remove them.

Perhaps it makes sense to introduce netdev_nl_err_msg(dev, extack, msg)
that would do both today and refactor the code to use it?
Later it could be changed to do only NL_SET_ERR_MSG.

[PATCH net-next 04/11] mroute*: Make mr_table a common struct

2018-02-27 Thread Yuval Mintz

Following previous changes to ip6mr, mr_table and mr6_table are
basically the same [up to mr6_table having additional '6' suffixes to
its variable names].
Move the common structure definition into a common header; This
requires renaming all references in ip6mr to variables that had the
distinct suffix.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute.h  |  21 
 include/linux/mroute6.h |   1 -
 include/linux/mroute_base.h |  46 +++
 include/net/netns/ipv6.h|   2 +-
 net/ipv4/ipmr.c |   2 -
 net/ipv6/ip6mr.c| 301 
 6 files changed, 186 insertions(+), 187 deletions(-)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index b8aadff..8688c5d 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -4,8 +4,6 @@
 
 #include 
 #include 
-#include 
-#include 
 #include 
 #include 
 #include 
@@ -67,25 +65,6 @@ struct vif_entry_notifier_info {
 
 #define VIFF_STATIC 0x8000
 
-#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
-
-struct mr_table {
-   struct list_head list;
-   possible_net_t  net;
-   u32 id;
-   struct sock __rcu   *mroute_sk;
-   struct timer_list   ipmr_expire_timer;
-   struct list_head mfc_unres_queue;
-   struct vif_device   vif_table[MAXVIFS];
-   struct rhltable mfc_hash;
-   struct list_head mfc_cache_list;
-   int maxvif;
-   atomic_t cache_resolve_queue_len;
-   bool mroute_do_assert;
-   bool mroute_do_pim;
-   int mroute_reg_vif_num;
-};
-
 /* mfc_flags:
  * MFC_STATIC - the entry was added statically (not by a routing daemon)
  * MFC_OFFLOAD - the entry was offloaded to the hardware
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index e2dac19..d5c8dc1 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -8,7 +8,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline int ip6_mroute_opt(int opt)
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 0de651e..1cc944a 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -2,6 +2,9 @@
 #define __LINUX_MROUTE_BASE_H
 
 #include 
+#include 
+#include 
+#include 
 
 /**
  * struct vif_device - interface representor for multicast routing
@@ -32,6 +35,49 @@ struct vif_device {
__be32 local, remote;
 };
 
+#ifndef MAXVIFS
+/* This one is nasty; value is defined in uapi using different symbols for
+ * mroute and mroute6 but both map into same 32.
+ */
+#define MAXVIFS 32
+#endif
+
+#define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev))
+
+/**
+ * struct mr_table - a multicast routing table
+ * @list: entry within a list of multicast routing tables
+ * @net: net where this table belongs
+ * @id: identifier of the table
+ * @mroute_sk: socket associated with the table
+ * @ipmr_expire_timer: timer for handling unresolved routes
+ * @mfc_unres_queue: list of unresolved MFC entries
+ * @vif_table: array containing all possible vifs
+ * @mfc_hash: Hash table of all resolved routes for easy lookup
+ * @mfc_cache_list: list of resolved routes for possible traversal
+ * @maxvif: Identifier of highest value vif currently in use
+ * @cache_resolve_queue_len: current size of unresolved queue
+ * @mroute_do_assert: Whether to inform userspace on wrong ingress
+ * @mroute_do_pim: Whether to receive IGMP PIMv1
+ * @mroute_reg_vif_num: PIM-device vif index
+ */
+struct mr_table {
+   struct list_head list;
+   possible_net_t  net;
+   u32 id;
+   struct sock __rcu   *mroute_sk;
+   struct timer_list   ipmr_expire_timer;
+   struct list_head mfc_unres_queue;
+   struct vif_device   vif_table[MAXVIFS];
+   struct rhltable mfc_hash;
+   struct list_head mfc_cache_list;
+   int maxvif;
+   atomic_t cache_resolve_queue_len;
+   bool mroute_do_assert;
+   bool mroute_do_pim;
+   int mroute_reg_vif_num;
+};
+
 #ifdef CONFIG_IP_MROUTE_COMMON
 void vif_device_init(struct vif_device *v,
 struct net_device *dev,
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 987cc45..0d177fa9 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -84,7 +84,7 @@ struct netns_ipv6 {
struct sock *mc_autojoin_sk;
 #ifdef CONFIG_IPV6_MROUTE
 #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
-   struct mr6_table*mrt6;
+   struct mr_table *mrt6;
 #else
struct list_headmr6_tables;
struct fib_rules_ops*mr6_rules_ops;
diff --git a/net/ipv4/

[PATCH net-next 06/11] ipmr, ip6mr: Make mfc_cache a common structure

2018-02-27 Thread Yuval Mintz

mfc_cache and mfc6_cache are almost identical - the main difference is
in the origin/group addresses and comparison-key. Make a common
structure encapsulating most of the multicast routing logic - mr_mfc -
and convert both ipmr and ip6mr to use it.

For easy conversion [casting, in this case] mr_mfc has to be the first
field inside every multicast routing abstraction utilizing it.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c |  21 +-
 include/linux/mroute.h|  45 +---
 include/linux/mroute6.h   |  23 +--
 include/linux/mroute_base.h   |  45 
 net/ipv4/ipmr.c   | 234 +++--
 net/ipv6/ip6mr.c  | 241 +++---
 6 files changed, 312 insertions(+), 297 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
index d20b143..978a3c7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
@@ -126,8 +126,8 @@ mlxsw_sp_mr_route_ivif_in_evifs(const struct 
mlxsw_sp_mr_route *mr_route)
 
switch (mr_route->mr_table->proto) {
case MLXSW_SP_L3_PROTO_IPV4:
-   ivif = mr_route->mfc4->mfc_parent;
-   return mr_route->mfc4->mfc_un.res.ttls[ivif] != 255;
+   ivif = mr_route->mfc4->_c.mfc_parent;
+   return mr_route->mfc4->_c.mfc_un.res.ttls[ivif] != 255;
case MLXSW_SP_L3_PROTO_IPV6:
/* fall through */
default:
@@ -364,7 +364,7 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table 
*mr_table,
mr_route->mfc4 = mfc;
mr_route->mr_table = mr_table;
for (i = 0; i < MAXVIFS; i++) {
-   if (mfc->mfc_un.res.ttls[i] != 255) {
+   if (mfc->_c.mfc_un.res.ttls[i] != 255) {
err = mlxsw_sp_mr_route_evif_link(mr_route,
  &mr_table->vifs[i]);
if (err)
@@ -374,7 +374,8 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table 
*mr_table,
mr_route->min_mtu = mr_table->vifs[i].dev->mtu;
}
}
-   mlxsw_sp_mr_route_ivif_link(mr_route, &mr_table->vifs[mfc->mfc_parent]);
+   mlxsw_sp_mr_route_ivif_link(mr_route,
+   &mr_table->vifs[mfc->_c.mfc_parent]);
 
mr_route->route_action = mlxsw_sp_mr_route_action(mr_route);
return mr_route;
@@ -418,9 +419,9 @@ static void mlxsw_sp_mr_mfc_offload_set(struct 
mlxsw_sp_mr_route *mr_route,
switch (mr_route->mr_table->proto) {
case MLXSW_SP_L3_PROTO_IPV4:
if (offload)
-   mr_route->mfc4->mfc_flags |= MFC_OFFLOAD;
+   mr_route->mfc4->_c.mfc_flags |= MFC_OFFLOAD;
else
-   mr_route->mfc4->mfc_flags &= ~MFC_OFFLOAD;
+   mr_route->mfc4->_c.mfc_flags &= ~MFC_OFFLOAD;
break;
case MLXSW_SP_L3_PROTO_IPV6:
/* fall through */
@@ -943,10 +944,10 @@ static void mlxsw_sp_mr_route_stats_update(struct 
mlxsw_sp *mlxsw_sp,
 
switch (mr_route->mr_table->proto) {
case MLXSW_SP_L3_PROTO_IPV4:
-   if (mr_route->mfc4->mfc_un.res.pkt != packets)
-   mr_route->mfc4->mfc_un.res.lastuse = jiffies;
-   mr_route->mfc4->mfc_un.res.pkt = packets;
-   mr_route->mfc4->mfc_un.res.bytes = bytes;
+   if (mr_route->mfc4->_c.mfc_un.res.pkt != packets)
+   mr_route->mfc4->_c.mfc_un.res.lastuse = jiffies;
+   mr_route->mfc4->_c.mfc_un.res.pkt = packets;
+   mr_route->mfc4->_c.mfc_un.res.bytes = bytes;
break;
case MLXSW_SP_L3_PROTO_IPV6:
/* fall through */
diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 8688c5d..63b36e6 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -81,28 +81,13 @@ struct mfc_cache_cmp_arg {
 
 /**
  * struct mfc_cache - multicast routing entries
- * @mnode: rhashtable list
+ * @_c: Common multicast routing information; has to be first [for casting]
  * @mfc_mcastgrp: destination multicast group address
  * @mfc_origin: source address
  * @cmparg: used for rhashtable comparisons
- * @mfc_parent: source interface (iif)
- * @mfc_flags: entry flags
- * @expires: unresolved entry expire time
- * @unresolved: unresolved cached skbs
- * @last_assert: time of last assert
- * @minvif: minimum VIF id
- * @maxvif: maximum VIF id
- * @bytes: bytes that have passed for this entry
- * @pkt: packets that have pas

[PATCH net-next 10/11] ip6mr: Remove MFC_NOTIFY and refactor flags

2018-02-27 Thread Yuval Mintz

MFC_NOTIFY exists in ip6mr, probably as some legacy code
[was already removed for ipmr in commit
06bd6c0370bb ("net: ipmr: remove unused MFC_NOTIFY flag and make the flags 
enum")].
Remove it from ip6mr as well, and move the enum into a common file;
Notice MFC_OFFLOAD is currently only used by ipmr.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute.h  | 9 -
 include/linux/mroute6.h | 3 ---
 include/linux/mroute_base.h | 9 +
 net/ipv6/ip6mr.c| 3 ---
 4 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 63b36e6..7ed82e4 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -65,15 +65,6 @@ struct vif_entry_notifier_info {
 
 #define VIFF_STATIC 0x8000
 
-/* mfc_flags:
- * MFC_STATIC - the entry was added statically (not by a routing daemon)
- * MFC_OFFLOAD - the entry was offloaded to the hardware
- */
-enum {
-   MFC_STATIC = BIT(0),
-   MFC_OFFLOAD = BIT(1),
-};
-
 struct mfc_cache_cmp_arg {
__be32 mfc_mcastgrp;
__be32 mfc_origin;
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 6acf576..1ac38e6 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -81,9 +81,6 @@ struct mfc6_cache {
};
 };
 
-#define MFC_STATIC 1
-#define MFC_NOTIFY 2
-
 #define MFC_ASSERT_THRESH (3*HZ)   /* Maximal freq. of asserts */
 
 struct rtmsg;
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index edc6e6b..2054118 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -45,6 +45,15 @@ struct vif_device {
 
 #define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev))
 
+/* mfc_flags:
+ * MFC_STATIC - the entry was added statically (not by a routing daemon)
+ * MFC_OFFLOAD - the entry was offloaded to the hardware
+ */
+enum {
+   MFC_STATIC = BIT(0),
+   MFC_OFFLOAD = BIT(1),
+};
+
 /**
  * struct mr_mfc - common multicast routing entries
  * @mnode: rhashtable list
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 9773b80..0709ae3 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2208,9 +2208,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, 
struct rtmsg *rtm,
return err;
}
 
-   if (rtm->rtm_flags & RTM_F_NOTIFY)
-   cache->_c.mfc_flags |= MFC_NOTIFY;
-
err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
read_unlock(_lock);
return err;
-- 
2.4.3

[PATCH net-next 11/11] ipmr, ip6mr: Unite dumproute flows

2018-02-27 Thread Yuval Mintz

The various MFC entries are being held in the same kind of mr_tables
for both ipmr and ip6mr, and their traversal logic is identical.
Also, with the exception of the addresses [and other small tidbits]
the major bulk of the nla setting is identical.

Unite as much of the dumping as possible between the two.
Notice this requires creating an mr_table iterator for each, as the
for-each preprocessor macro can't be used by the common logic.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h |  29 
 net/ipv4/ipmr.c | 161 +++-
 net/ipv4/ipmr_base.c| 123 +
 net/ipv6/ip6mr.c| 156 +++---
 4 files changed, 230 insertions(+), 239 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 2054118..2da30da 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -170,6 +170,16 @@ void *mr_mfc_find_parent(struct mr_table *mrt,
 void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi);
 void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg);
 
+int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+  struct mr_mfc *c, struct rtmsg *rtm);
+int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
+struct mr_table *(*iter)(struct net *net,
+ struct mr_table *mrt),
+int (*fill)(struct mr_table *mrt,
+struct sk_buff *skb,
+u32 portid, u32 seq, struct mr_mfc *c,
+int cmd, int flags),
+spinlock_t *lock);
 #else
 static inline void vif_device_init(struct vif_device *v,
   struct net_device *dev,
@@ -207,6 +217,25 @@ static inline struct mr_mfc *mr_mfc_find_any(struct 
mr_table *mrt,
 {
return NULL;
 }
+
+static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+struct mr_mfc *c, struct rtmsg *rtm)
+{
+   return -EINVAL;
+}
+
+static inline int
+mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
+struct mr_table *(*iter)(struct net *net,
+ struct mr_table *mrt),
+int (*fill)(struct mr_table *mrt,
+struct sk_buff *skb,
+u32 portid, u32 seq, struct mr_mfc *c,
+int cmd, int flags),
+spinlock_t *lock)
+{
+   return -EINVAL;
+}
 #endif
 
 static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 0363524..ffc929f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -104,8 +104,6 @@ static void ip_mr_forward(struct net *net, struct mr_table 
*mrt,
  struct mfc_cache *cache, int local);
 static int ipmr_cache_report(struct mr_table *mrt,
 struct sk_buff *pkt, vifi_t vifi, int assert);
-static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
- struct mr_mfc *c, struct rtmsg *rtm);
 static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
 int cmd);
 static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
@@ -116,6 +114,23 @@ static void ipmr_expire_process(struct timer_list *t);
 #define ipmr_for_each_table(mrt, net) \
list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
 
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+  struct mr_table *mrt)
+{
+   struct mr_table *ret;
+
+   if (!mrt)
+   ret = list_entry_rcu(net->ipv4.mr_tables.next,
+struct mr_table, list);
+   else
+   ret = list_entry_rcu(mrt->list.next,
+struct mr_table, list);
+
+   if (&ret->list == &net->ipv4.mr_tables)
+   return NULL;
+   return ret;
+}
+
 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -283,6 +298,14 @@ EXPORT_SYMBOL(ipmr_rule_default);
 #define ipmr_for_each_table(mrt, net) \
for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
 
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+  struct mr_table *mrt)
+{
+   if (!mrt)
+   return net->ipv4.mrt;
+   return NULL;
+}
+
 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 {
return net->ipv4.mrt;
@@ -1050,8 +1073,8 @@ static void ipmr_cache_resolve(struct net *net, struct 
mr_table *mrt,
struc

[PATCH net-next 03/11] ip6mr: Align hash implementation to ipmr

2018-02-27 Thread Yuval Mintz

Since commit 8fb472c09b9d ("ipmr: improve hash scalability") ipmr has
been using rhashtable as a basis for its mfc routes, but ip6mr is
currently still using the old private MFC hash implementation.

Align ip6mr to the current ipmr implementation.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute6.h |  30 ++---
 net/ipv6/ip6mr.c| 313 ++--
 2 files changed, 184 insertions(+), 159 deletions(-)

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index e1b9fb0..e2dac19 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline int ip6_mroute_opt(int opt)
@@ -65,10 +66,20 @@ static inline void ip6_mr_cleanup(void)
 
 #define VIFF_STATIC 0x8000
 
+struct mfc6_cache_cmp_arg {
+   struct in6_addr mf6c_mcastgrp;
+   struct in6_addr mf6c_origin;
+};
+
 struct mfc6_cache {
-   struct list_head list;
-   struct in6_addr mf6c_mcastgrp;  /* Group the entry 
belongs to   */
-   struct in6_addr mf6c_origin;/* Source of packet 
*/
+   struct rhlist_head mnode;
+   union {
+   struct {
+   struct in6_addr mf6c_mcastgrp;
+   struct in6_addr mf6c_origin;
+   };
+   struct mfc6_cache_cmp_arg cmparg;
+   };
mifi_t mf6c_parent; /* Source interface 
*/
int mfc_flags;  /* Flags on line
*/
 
@@ -88,22 +99,13 @@ struct mfc6_cache {
unsigned char ttls[MAXMIFS];/* TTL thresholds   
*/
} res;
} mfc_un;
+   struct list_head list;
+   struct rcu_head rcu;
 };
 
 #define MFC_STATIC 1
 #define MFC_NOTIFY 2
 
-#define MFC6_LINES 64
-
-#define MFC6_HASH(a, g) (((__force u32)(a)->s6_addr32[0] ^ \
- (__force u32)(a)->s6_addr32[1] ^ \
- (__force u32)(a)->s6_addr32[2] ^ \
- (__force u32)(a)->s6_addr32[3] ^ \
- (__force u32)(g)->s6_addr32[0] ^ \
- (__force u32)(g)->s6_addr32[1] ^ \
- (__force u32)(g)->s6_addr32[2] ^ \
- (__force u32)(g)->s6_addr32[3]) % MFC6_LINES)
-
 #define MFC_ASSERT_THRESH (3*HZ)   /* Maximal freq. of asserts */
 
 struct rtmsg;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a0e297d..6f0b7f4 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -61,8 +61,9 @@ struct mr6_table {
struct sock __rcu   *mroute6_sk;
struct timer_list   ipmr_expire_timer;
struct list_headmfc6_unres_queue;
-   struct list_headmfc6_cache_array[MFC6_LINES];
struct vif_device   vif6_table[MAXMIFS];
+   struct rhltable mfc6_hash;
+   struct list_headmfc6_cache_list;
int maxvif;
atomic_tcache_resolve_queue_len;
boolmroute_do_assert;
@@ -299,10 +300,29 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 }
 #endif
 
+static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+   const struct mfc6_cache_cmp_arg *cmparg = arg->key;
+   struct mfc6_cache *c = (struct mfc6_cache *)ptr;
+
+   return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) ||
+  !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin);
+}
+
+static const struct rhashtable_params ip6mr_rht_params = {
+   .head_offset = offsetof(struct mfc6_cache, mnode),
+   .key_offset = offsetof(struct mfc6_cache, cmparg),
+   .key_len = sizeof(struct mfc6_cache_cmp_arg),
+   .nelem_hint = 3,
+   .locks_mul = 1,
+   .obj_cmpfn = ip6mr_hash_cmp,
+   .automatic_shrinking = true,
+};
+
 static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
 {
struct mr6_table *mrt;
-   unsigned int i;
 
mrt = ip6mr_get_table(net, id);
if (mrt)
@@ -314,10 +334,8 @@ static struct mr6_table *ip6mr_new_table(struct net *net, 
u32 id)
mrt->id = id;
write_pnet(>net, net);
 
-   /* Forwarding cache */
-   for (i = 0; i < MFC6_LINES; i++)
-   INIT_LIST_HEAD(>mfc6_cache_array[i]);
-
+   rhltable_init(&mrt->mfc6_hash, &ip6mr_rht_params);
+   INIT_LIST_HEAD(&mrt->mfc6_cache_list);
INIT_LIST_HEAD(&mrt->mfc6_unres_queue);
 
timer_setup(>ipmr_expire_timer, ipmr_expire_process, 0);
@@ -335,6 +353,7 @@ static void ip6mr_free_table(struct mr6_table *mrt)
 {
del_timer_sync(>ipmr_expire_timer);
mroute_clean_tables(mrt, true);
+   rhltable_destroy(>mfc6_hash

[PATCH net-next 08/11] ipmr, ip6mr: Unite mfc seq logic

2018-02-27 Thread Yuval Mintz

With the exception of the final dump, ipmr and ip6mr have the exact same
seq logic for traversing a given mr_table. Refactor that code and make
it common.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h | 69 
 net/ipv4/ipmr.c | 93 +++
 net/ipv4/ipmr_base.c| 62 +
 net/ipv6/ip6mr.c| 97 -
 4 files changed, 143 insertions(+), 178 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 18a1d75..413f103 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -3,6 +3,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -203,4 +204,72 @@ static inline void *mr_mfc_find(struct mr_table *mrt, void 
*hasharg)
 {
return mr_mfc_find_parent(mrt, hasharg, -1);
 }
+
+#ifdef CONFIG_PROC_FS
+struct mr_mfc_iter {
+   struct seq_net_private p;
+   struct mr_table *mrt;
+   struct list_head *cache;
+
+   /* Lock protecting the mr_table's unresolved queue */
+   spinlock_t *lock;
+};
+
+#ifdef CONFIG_IP_MROUTE_COMMON
+/* These actually return 'struct mr_mfc *', but to avoid need for explicit
+ * castings they simply return void.
+ */
+void *mr_mfc_seq_idx(struct net *net,
+struct mr_mfc_iter *it, loff_t pos);
+void *mr_mfc_seq_next(struct seq_file *seq, void *v,
+ loff_t *pos);
+
+static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos,
+struct mr_table *mrt, spinlock_t *lock)
+{
+   struct mr_mfc_iter *it = seq->private;
+
+   it->mrt = mrt;
+   it->cache = NULL;
+   it->lock = lock;
+
+   return *pos ? mr_mfc_seq_idx(seq_file_net(seq),
+seq->private, *pos - 1)
+   : SEQ_START_TOKEN;
+}
+
+static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+   struct mr_mfc_iter *it = seq->private;
+   struct mr_table *mrt = it->mrt;
+
+   if (it->cache == &mrt->mfc_unres_queue)
+   spin_unlock_bh(it->lock);
+   else if (it->cache == &mrt->mfc_cache_list)
+   rcu_read_unlock();
+}
+#else
+static inline void *mr_mfc_seq_idx(struct net *net,
+  struct mr_mfc_iter *it, loff_t pos)
+{
+   return NULL;
+}
+
+static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v,
+   loff_t *pos)
+{
+   return NULL;
+}
+
+static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos,
+struct mr_table *mrt, spinlock_t *lock)
+{
+   return NULL;
+}
+
+static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+}
+#endif
+#endif
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d0bf021..22b47e0 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -3014,41 +3014,8 @@ static const struct file_operations ipmr_vif_fops = {
.release = seq_release_net,
 };
 
-struct ipmr_mfc_iter {
-   struct seq_net_private p;
-   struct mr_table *mrt;
-   struct list_head *cache;
-};
-
-static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
- struct ipmr_mfc_iter *it, loff_t pos)
-{
-   struct mr_table *mrt = it->mrt;
-   struct mr_mfc *mfc;
-
-   rcu_read_lock();
-   it->cache = &mrt->mfc_cache_list;
-   list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
-   if (pos-- == 0)
-   return (struct mfc_cache *)mfc;
-   rcu_read_unlock();
-
-   spin_lock_bh(_unres_lock);
-   it->cache = >mfc_unres_queue;
-   list_for_each_entry(mfc, it->cache, list)
-   if (pos-- == 0)
-   return (struct mfc_cache *)mfc;
-
-   spin_unlock_bh(_unres_lock);
-
-   it->cache = NULL;
-   return NULL;
-}
-
-
 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 {
-   struct ipmr_mfc_iter *it = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt;
 
@@ -3056,57 +3023,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, 
loff_t *pos)
if (!mrt)
return ERR_PTR(-ENOENT);
 
-   it->mrt = mrt;
-   it->cache = NULL;
-   return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
-   : SEQ_START_TOKEN;
-}
-
-static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-   struct ipmr_mfc_iter *it = seq->private;
-   struct net *net = seq_file_net(seq);
-   struct mr_table *mrt = it->mrt;
-   struct mfc_cache *mfc = v;
-
-   ++*pos;
-
-   if (v == SEQ_START_TOKEN)
-   return ipmr_mfc_seq_idx(net, seq->private, 0);
-
-   if (mfc->_c.list.n

[PATCH net-next 07/11] ipmr, ip6mr: Unite logic for searching in MFC cache

2018-02-27 Thread Yuval Mintz

ipmr and ip6mr utilize the exact same methods for searching the
hashed resolved connections, difference being only in the construction
of the hash comparison key.

In order to unite the flow, introduce an mr_table operation set that
would contain the protocol specific information required for common
flows, in this case - the hash parameters and a comparison key
representing a (*,*) route.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h | 52 +--
 net/ipv4/ipmr.c | 71 ++-
 net/ipv4/ipmr_base.c| 54 +++--
 net/ipv6/ip6mr.c| 74 +++--
 4 files changed, 134 insertions(+), 117 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 2769e2f..18a1d75 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -89,10 +89,23 @@ struct mr_mfc {
struct rcu_head rcu;
 };
 
+struct mr_table;
+
+/**
+ * struct mr_table_ops - callbacks and info for protocol-specific ops
+ * @rht_params: parameters for accessing the MFC hash
+ * @cmparg_any: a hash key to be used for matching on (*,*) routes
+ */
+struct mr_table_ops {
+   const struct rhashtable_params *rht_params;
+   void *cmparg_any;
+};
+
 /**
  * struct mr_table - a multicast routing table
  * @list: entry within a list of multicast routing tables
  * @net: net where this table belongs
+ * @ops: protocol specific operations
  * @id: identifier of the table
  * @mroute_sk: socket associated with the table
  * @ipmr_expire_timer: timer for handling unresolved routes
@@ -109,6 +122,7 @@ struct mr_mfc {
 struct mr_table {
struct list_headlist;
possible_net_t  net;
+   struct mr_table_ops ops;
u32 id;
struct sock __rcu   *mroute_sk;
struct timer_list   ipmr_expire_timer;
@@ -133,10 +147,19 @@ void vif_device_init(struct vif_device *v,
 
 struct mr_table *
 mr_table_alloc(struct net *net, u32 id,
-  const struct rhashtable_params *rht_params,
+  struct mr_table_ops *ops,
   void (*expire_func)(struct timer_list *t),
   void (*table_set)(struct mr_table *mrt,
 struct net *net));
+
+/* These actually return 'struct mr_mfc *', but to avoid need for explicit
+ * castings they simply return void.
+ */
+void *mr_mfc_find_parent(struct mr_table *mrt,
+void *hasharg, int parent);
+void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi);
+void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg);
+
 #else
 static inline void vif_device_init(struct vif_device *v,
   struct net_device *dev,
@@ -147,14 +170,37 @@ static inline void vif_device_init(struct vif_device *v,
 {
 }
 
-static inline struct mr_table *
+static inline void *
 mr_table_alloc(struct net *net, u32 id,
-  const struct rhashtable_params *rht_params,
+  struct mr_table_ops *ops,
   void (*expire_func)(struct timer_list *t),
   void (*table_set)(struct mr_table *mrt,
 struct net *net))
 {
return NULL;
 }
+
+static inline void *mr_mfc_find_parent(struct mr_table *mrt,
+  void *hasharg, int parent)
+{
+   return NULL;
+}
+
+static inline void *mr_mfc_find_any_parent(struct mr_table *mrt,
+  int vifi)
+{
+   return NULL;
+}
+
+static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt,
+int vifi, void *hasharg)
+{
+   return NULL;
+}
 #endif
+
+static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg)
+{
+   return mr_mfc_find_parent(mrt, hasharg, -1);
+}
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 238a579..d0bf021 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -359,6 +359,16 @@ static void ipmr_new_table_set(struct mr_table *mrt,
 #endif
 }
 
+static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = {
+   .mfc_mcastgrp = htonl(INADDR_ANY),
+   .mfc_origin = htonl(INADDR_ANY),
+};
+
+static struct mr_table_ops ipmr_mr_table_ops = {
+   .rht_params = &ipmr_rht_params,
+   .cmparg_any = &ipmr_mr_table_ops_cmparg_any,
+};
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -371,7 +381,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 
id)
if (mrt)
return mrt;
 
-   return mr_table_alloc(net, id, &ipmr_rht_params,
+   return mr_table_alloc(net, id, &ipmr_mr_table_ops,
  ipmr_expire_process, ipmr_new_table_set);
 }
 
@@ -972,33 +982,8 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table 
*mrt,
.mfc_mc

[PATCH net-next 02/11] ip6mr: Make mroute_sk rcu-based

2018-02-27 Thread Yuval Mintz

In ipmr the mr_table socket is handled under RCU. Introduce the same
for ip6mr.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute6.h |  6 +++---
 net/ipv6/ip6_output.c   |  2 +-
 net/ipv6/ip6mr.c| 45 +++--
 3 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index e5e5b82..e1b9fb0 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -111,12 +111,12 @@ extern int ip6mr_get_route(struct net *net, struct 
sk_buff *skb,
   struct rtmsg *rtm, u32 portid);
 
 #ifdef CONFIG_IPV6_MROUTE
-extern struct sock *mroute6_socket(struct net *net, struct sk_buff *skb);
+bool mroute6_is_socket(struct net *net, struct sk_buff *skb);
 extern int ip6mr_sk_done(struct sock *sk);
 #else
-static inline struct sock *mroute6_socket(struct net *net, struct sk_buff *skb)
+static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
 {
-   return NULL;
+   return false;
 }
 static inline int ip6mr_sk_done(struct sock *sk)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 997c7f1..a6eb0e6 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -71,7 +71,7 @@ static int ip6_finish_output2(struct net *net, struct sock 
*sk, struct sk_buff *
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 
if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
-   ((mroute6_socket(net, skb) &&
+   ((mroute6_is_socket(net, skb) &&
 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 &ipv6_hdr(skb)->saddr))) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index e397990..a0e297d 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -58,7 +58,7 @@ struct mr6_table {
struct list_headlist;
possible_net_t  net;
u32 id;
-   struct sock *mroute6_sk;
+   struct sock __rcu   *mroute6_sk;
struct timer_list   ipmr_expire_timer;
struct list_headmfc6_unres_queue;
struct list_headmfc6_cache_array[MFC6_LINES];
@@ -1121,6 +1121,7 @@ static void ip6mr_cache_resolve(struct net *net, struct 
mr6_table *mrt,
 static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
  mifi_t mifi, int assert)
 {
+   struct sock *mroute6_sk;
struct sk_buff *skb;
struct mrt6msg *msg;
int ret;
@@ -1190,17 +1191,19 @@ static int ip6mr_cache_report(struct mr6_table *mrt, 
struct sk_buff *pkt,
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
 
-   if (!mrt->mroute6_sk) {
+   rcu_read_lock();
+   mroute6_sk = rcu_dereference(mrt->mroute6_sk);
+   if (!mroute6_sk) {
+   rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
 
mrt6msg_netlink_event(mrt, skb);
 
-   /*
-*  Deliver to user space multicast routing algorithms
-*/
-   ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb);
+   /* Deliver to user space multicast routing algorithms */
+   ret = sock_queue_rcv_skb(mroute6_sk, skb);
+   rcu_read_unlock();
if (ret < 0) {
net_warn_ratelimited("mroute6: pending queue full, dropping 
entries\n");
kfree_skb(skb);
@@ -1584,11 +1587,11 @@ static int ip6mr_sk_init(struct mr6_table *mrt, struct 
sock *sk)
 
rtnl_lock();
write_lock_bh(_lock);
-   if (likely(mrt->mroute6_sk == NULL)) {
-   mrt->mroute6_sk = sk;
-   net->ipv6.devconf_all->mc_forwarding++;
-   } else {
+   if (rtnl_dereference(mrt->mroute6_sk)) {
err = -EADDRINUSE;
+   } else {
+   rcu_assign_pointer(mrt->mroute6_sk, sk);
+   net->ipv6.devconf_all->mc_forwarding++;
}
write_unlock_bh(_lock);
 
@@ -1614,9 +1617,9 @@ int ip6mr_sk_done(struct sock *sk)
 
rtnl_lock();
ip6mr_for_each_table(mrt, net) {
-   if (sk == mrt->mroute6_sk) {
+   if (sk == rtnl_dereference(mrt->mroute6_sk)) {
write_lock_bh(_lock);
-   mrt->mroute6_sk = NULL;
+   RCU_INIT_POINTER(mrt->mroute6_sk, NULL);
net->ipv6.devconf_all->mc_forwarding--;
write_unlock_bh(_lock);
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -1630,11 +1633,12 @@ int ip6mr_sk_done(struct sock *sk)
}
}
rtnl_unlock();
+   synchronize_rcu();
 
return err;
 }
 
-struct sock *mroute6_socket(struct

[PATCH net-next 09/11] ipmr, ip6mr: Unite vif seq functions

2018-02-27 Thread Yuval Mintz

Same as previously done with the mfc seq, the logic for the vif seq is
refactored to be shared between ipmr and ip6mr.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h | 33 ++
 net/ipv4/ipmr.c | 49 +---
 net/ipv4/ipmr_base.c| 33 ++
 net/ipv6/ip6mr.c| 50 +
 4 files changed, 76 insertions(+), 89 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 413f103..edc6e6b 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -206,6 +206,12 @@ static inline void *mr_mfc_find(struct mr_table *mrt, void 
*hasharg)
 }
 
 #ifdef CONFIG_PROC_FS
+struct mr_vif_iter {
+   struct seq_net_private p;
+   struct mr_table *mrt;
+   int ct;
+};
+
 struct mr_mfc_iter {
struct seq_net_private p;
struct mr_table *mrt;
@@ -216,6 +222,16 @@ struct mr_mfc_iter {
 };
 
 #ifdef CONFIG_IP_MROUTE_COMMON
+void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos);
+void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos);
+
+static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+{
+   return *pos ? mr_vif_seq_idx(seq_file_net(seq),
+seq->private, *pos - 1)
+   : SEQ_START_TOKEN;
+}
+
 /* These actually return 'struct mr_mfc *', but to avoid need for explicit
  * castings they simply return void.
  */
@@ -249,6 +265,23 @@ static inline void mr_mfc_seq_stop(struct seq_file *seq, 
void *v)
rcu_read_unlock();
 }
 #else
+static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter,
+  loff_t pos)
+{
+   return NULL;
+}
+
+static inline void *mr_vif_seq_next(struct seq_file *seq,
+   void *v, loff_t *pos)
+{
+   return NULL;
+}
+
+static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+{
+   return NULL;
+}
+
 static inline void *mr_mfc_seq_idx(struct net *net,
   struct mr_mfc_iter *it, loff_t pos)
 {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 22b47e0..0363524 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2908,31 +2908,11 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, 
struct netlink_callback *cb)
 /* The /proc interfaces to multicast routing :
  * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
  */
-struct ipmr_vif_iter {
-   struct seq_net_private p;
-   struct mr_table *mrt;
-   int ct;
-};
-
-static struct vif_device *ipmr_vif_seq_idx(struct net *net,
-   struct ipmr_vif_iter *iter,
-   loff_t pos)
-{
-   struct mr_table *mrt = iter->mrt;
-
-   for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
-   if (!VIF_EXISTS(mrt, iter->ct))
-   continue;
-   if (pos-- == 0)
-   return >vif_table[iter->ct];
-   }
-   return NULL;
-}
 
 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(mrt_lock)
 {
-   struct ipmr_vif_iter *iter = seq->private;
+   struct mr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt;
 
@@ -2943,26 +2923,7 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, 
loff_t *pos)
iter->mrt = mrt;
 
read_lock(_lock);
-   return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
-   : SEQ_START_TOKEN;
-}
-
-static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-   struct ipmr_vif_iter *iter = seq->private;
-   struct net *net = seq_file_net(seq);
-   struct mr_table *mrt = iter->mrt;
-
-   ++*pos;
-   if (v == SEQ_START_TOKEN)
-   return ipmr_vif_seq_idx(net, iter, 0);
-
-   while (++iter->ct < mrt->maxvif) {
-   if (!VIF_EXISTS(mrt, iter->ct))
-   continue;
-   return >vif_table[iter->ct];
-   }
-   return NULL;
+   return mr_vif_seq_start(seq, pos);
 }
 
 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
@@ -2973,7 +2934,7 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void 
*v)
 
 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 {
-   struct ipmr_vif_iter *iter = seq->private;
+   struct mr_vif_iter *iter = seq->private;
struct mr_table *mrt = iter->mrt;
 
if (v == SEQ_START_TOKEN) {
@@ -2996,7 +2957,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void 
*v)
 
 static const struct seq_operations ipmr_vif_seq_ops = {
.start = ipmr_vif_seq_start,
-   .next  = ipmr_vif_seq_next

[PATCH net-next 01/11] ipmr,ipmr6: Define a uniform vif_device

2018-02-27 Thread Yuval Mintz

The two implementations have almost identical structures - vif_device and
mif_device. As a step toward unifying the mr_tables, eliminate the
mif_device and relocate the vif_device definition into a new common
header file.

Also, introduce a common initialization function for setting most of the
vif_device fields in a new common source file. This requires modifying
the ipv{4,6} Kconfig and ipv4 makefile as we're introducing a new common
config option - CONFIG_IP_MROUTE_COMMON.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute.h  | 13 +---
 include/linux/mroute6.h | 11 +-
 include/linux/mroute_base.h | 52 +
 net/ipv4/Kconfig|  5 +
 net/ipv4/Makefile   |  1 +
 net/ipv4/ipmr.c | 32 +---
 net/ipv4/ipmr_base.c| 28 
 net/ipv6/Kconfig|  1 +
 net/ipv6/ip6mr.c| 37 
 9 files changed, 117 insertions(+), 63 deletions(-)
 create mode 100644 include/linux/mroute_base.h
 create mode 100644 net/ipv4/ipmr_base.c

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 5396521..b8aadff 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_IP_MROUTE
 static inline int ip_mroute_opt(int opt)
@@ -56,18 +57,6 @@ static inline bool ipmr_rule_default(const struct fib_rule 
*rule)
 }
 #endif
 
-struct vif_device {
-   struct net_device   *dev;   /* Device we are using 
*/
-   struct netdev_phys_item_id dev_parent_id;   /* Device parent ID
*/
-   unsigned long   bytes_in,bytes_out;
-   unsigned long   pkt_in,pkt_out; /* Statistics   
*/
-   unsigned long   rate_limit; /* Traffic shaping (NI) 
*/
-   unsigned char   threshold;  /* TTL threshold
*/
-   unsigned short  flags;  /* Control flags
*/
-   __be32  local,remote;   /* Addresses(remote for 
tunnels)*/
-   int link;   /* Physical interface index 
*/
-};
-
 struct vif_entry_notifier_info {
struct fib_notifier_info info;
struct net_device *dev;
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 3014c52..e5e5b82 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -7,6 +7,7 @@
 #include   /* for struct sk_buff_head */
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline int ip6_mroute_opt(int opt)
@@ -62,16 +63,6 @@ static inline void ip6_mr_cleanup(void)
 }
 #endif
 
-struct mif_device {
-   struct net_device   *dev;   /* Device we are using 
*/
-   unsigned long   bytes_in,bytes_out;
-   unsigned long   pkt_in,pkt_out; /* Statistics   
*/
-   unsigned long   rate_limit; /* Traffic shaping (NI) 
*/
-   unsigned char   threshold;  /* TTL threshold
*/
-   unsigned short  flags;  /* Control flags
*/
-   int link;   /* Physical interface index 
*/
-};
-
 #define VIFF_STATIC 0x8000
 
 struct mfc6_cache {
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
new file mode 100644
index 000..0de651e
--- /dev/null
+++ b/include/linux/mroute_base.h
@@ -0,0 +1,52 @@
+#ifndef __LINUX_MROUTE_BASE_H
+#define __LINUX_MROUTE_BASE_H
+
+#include 
+
+/**
+ * struct vif_device - interface representor for multicast routing
+ * @dev: network device being used
+ * @bytes_in: statistic; bytes ingressing
+ * @bytes_out: statistic; bytes egresing
+ * @pkt_in: statistic; packets ingressing
+ * @pkt_out: statistic; packets egressing
+ * @rate_limit: Traffic shaping (NI)
+ * @threshold: TTL threshold
+ * @flags: Control flags
+ * @link: Physical interface index
+ * @dev_parent_id: device parent id
+ * @local: Local address
+ * @remote: Remote address for tunnels
+ */
+struct vif_device {
+   struct net_device *dev;
+   unsigned long bytes_in, bytes_out;
+   unsigned long pkt_in, pkt_out;
+   unsigned long rate_limit;
+   unsigned char threshold;
+   unsigned short flags;
+   int link;
+
+   /* Currently only used by ipmr */
+   struct netdev_phys_item_id dev_parent_id;
+   __be32 local, remote;
+};
+
+#ifdef CONFIG_IP_MROUTE_COMMON
+void vif_device_init(struct vif_device *v,
+struct net_device *dev,
+unsigned long rate_limit,
+unsigned char threshold,
+unsigned short flags,
+unsigned short get_iflink_mask);
+#else
+static inline void vif_device_init(struct vif_device *v,
+  struct net_devic

[PATCH net-next 05/11] ipmr, ip6mr: Unite creation of new mr_table

2018-02-27 Thread Yuval Mintz

Now that both ipmr and ip6mr are using the same mr_table structure,
we can have a common function to allocate & initialize a new instance.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h | 17 +
 net/ipv4/ipmr.c | 27 ++-
 net/ipv4/ipmr_base.c| 27 +++
 net/ipv6/ip6mr.c| 30 ++
 4 files changed, 64 insertions(+), 37 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 1cc944a..8053057 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -85,6 +85,13 @@ void vif_device_init(struct vif_device *v,
 unsigned char threshold,
 unsigned short flags,
 unsigned short get_iflink_mask);
+
+struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+  const struct rhashtable_params *rht_params,
+  void (*expire_func)(struct timer_list *t),
+  void (*table_set)(struct mr_table *mrt,
+struct net *net));
 #else
 static inline void vif_device_init(struct vif_device *v,
   struct net_device *dev,
@@ -94,5 +101,15 @@ static inline void vif_device_init(struct vif_device *v,
   unsigned short get_iflink_mask)
 {
 }
+
+static inline struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+  const struct rhashtable_params *rht_params,
+  void (*expire_func)(struct timer_list *t),
+  void (*table_set)(struct mr_table *mrt,
+struct net *net))
+{
+   return NULL;
+}
 #endif
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 78046d2..09f965d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -351,6 +351,14 @@ static const struct rhashtable_params ipmr_rht_params = {
.automatic_shrinking = true,
 };
 
+static void ipmr_new_table_set(struct mr_table *mrt,
+  struct net *net)
+{
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+   list_add_tail_rcu(>list, >ipv4.mr_tables);
+#endif
+}
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -363,23 +371,8 @@ static struct mr_table *ipmr_new_table(struct net *net, 
u32 id)
if (mrt)
return mrt;
 
-   mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
-   if (!mrt)
-   return ERR_PTR(-ENOMEM);
-   write_pnet(>net, net);
-   mrt->id = id;
-
-   rhltable_init(>mfc_hash, _rht_params);
-   INIT_LIST_HEAD(>mfc_cache_list);
-   INIT_LIST_HEAD(>mfc_unres_queue);
-
-   timer_setup(>ipmr_expire_timer, ipmr_expire_process, 0);
-
-   mrt->mroute_reg_vif_num = -1;
-#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
-   list_add_tail_rcu(>list, >ipv4.mr_tables);
-#endif
-   return mrt;
+   return mr_table_alloc(net, id, _rht_params,
+ ipmr_expire_process, ipmr_new_table_set);
 }
 
 static void ipmr_free_table(struct mr_table *mrt)
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 22758f8..3e21a58 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -26,3 +26,30 @@ void vif_device_init(struct vif_device *v,
v->link = dev->ifindex;
 }
 EXPORT_SYMBOL(vif_device_init);
+
+struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+  const struct rhashtable_params *rht_params,
+  void (*expire_func)(struct timer_list *t),
+  void (*table_set)(struct mr_table *mrt,
+struct net *net))
+{
+   struct mr_table *mrt;
+
+   mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+   if (!mrt)
+   return NULL;
+   mrt->id = id;
+   write_pnet(>net, net);
+
+   rhltable_init(>mfc_hash, rht_params);
+   INIT_LIST_HEAD(>mfc_cache_list);
+   INIT_LIST_HEAD(>mfc_unres_queue);
+
+   timer_setup(>ipmr_expire_timer, expire_func, 0);
+
+   mrt->mroute_reg_vif_num = -1;
+   table_set(mrt, net);
+   return mrt;
+}
+EXPORT_SYMBOL(mr_table_alloc);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index adbb826..d508528 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -31,7 +31,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -295,6 +294,14 @@ static const struct rhashtable_params ip6mr_rht_params = {
.automatic_shrinking = true,
 };
 
+static void ip6mr_new_table_set(struct mr_table *mrt,
+   struct net *net)
+{
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+   list_add_tail_rcu(>list, >ipv6.mr6_tables);
+#endif
+}
+
 static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -303,25 +310,8 @@ static struct mr_table *ip6mr_new

[PATCH net-next 00/11] ipmr, ip6mr: Align multicast routing for IPv4 & IPv6

2018-02-27 Thread Yuval Mintz

Historically, ip6mr was based [cut-n-paste] on ipmr, and the two have not
diverged too much. Apparently, as ipv4 multicast routing is more common
than its ipv6 counterpart, modifications since then have mostly been
one-way, affecting ipmr while leaving ip6mr unchanged.

This series is meant to re-factor both ipmr and ip6mr into having common
structures [and some functionality], adding 2 new common files -
mroute_base.h and ipmr_base.c.

The series begins by bringing ip6mr up to speed with some of the changes
applied in the past to ipmr [#2, #3].
It is then possible to re-factor a lot of the common structures -
vif devices [#1], mr_table [#4], mfc_cache [#6] - and use the common
structures in both ipmr and ip6mr.

The rest of the patches re-factor some choice flows used by both ipmr
and ip6mr and eliminate duplication.

This series would later allow for easy extension of ipmr offloading
to support ip6mr offloading as well, as almost all structures
related to the offloading would be shared between the two protocols.

Changes from previous versions
--
RFC -> v1:
  - Corrected support for CONFIG_IP{,V6}_MROUTE_MULTIPLE_TABLES
  - Addressed a couple of kbuild test robot issues

Yuval Mintz (11):
  ipmr,ipmr6: Define a uniform vif_device
  ip6mr: Make mroute_sk rcu-based
  ip6mr: Align hash implementation to ipmr
  mroute*: Make mr_table a common struct
  ipmr, ip6mr: Unite creation of new mr_table
  ipmr, ip6mr: Make mfc_cache a common structure
  ipmr, ip6mr: Unite logic for searching in MFC cache
  ipmr, ip6mr: Unite mfc seq logic
  ipmr, ip6mr: Unite vif seq functions
  ip6mr: Remove MFC_NOTIFY and refactor flags
  ipmr, ip6mr: Unite dumproute flows

 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c |  21 +-
 include/linux/mroute.h|  88 +-
 include/linux/mroute6.h   |  62 +-
 include/linux/mroute_base.h   | 346 
 include/net/netns/ipv6.h  |   2 +-
 net/ipv4/Kconfig  |   5 +
 net/ipv4/Makefile |   1 +
 net/ipv4/ipmr.c   | 583 -
 net/ipv4/ipmr_base.c  | 323 +++
 net/ipv6/Kconfig  |   1 +
 net/ipv6/ip6_output.c |   2 +-
 net/ipv6/ip6mr.c  | 983 --
 12 files changed, 1250 insertions(+), 1167 deletions(-)
 create mode 100644 include/linux/mroute_base.h
 create mode 100644 net/ipv4/ipmr_base.c

-- 
2.4.3

[RFC net-next 03/11] ip6mr: Align hash implementation to ipmr

2018-02-20 Thread Yuval Mintz

Since commit 8fb472c09b9d ("ipmr: improve hash scalability") ipmr has
been using rhashtable as a basis for its mfc routes, but ip6mr is
currently still using the old private MFC hash implementation.

Align ip6mr to the current ipmr implementation.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute6.h |  30 ++---
 net/ipv6/ip6mr.c| 313 ++--
 2 files changed, 184 insertions(+), 159 deletions(-)

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index e1b9fb0..e2dac19 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline int ip6_mroute_opt(int opt)
@@ -65,10 +66,20 @@ static inline void ip6_mr_cleanup(void)
 
 #define VIFF_STATIC 0x8000
 
+struct mfc6_cache_cmp_arg {
+   struct in6_addr mf6c_mcastgrp;
+   struct in6_addr mf6c_origin;
+};
+
 struct mfc6_cache {
-   struct list_head list;
-   struct in6_addr mf6c_mcastgrp;  /* Group the entry 
belongs to   */
-   struct in6_addr mf6c_origin;/* Source of packet 
*/
+   struct rhlist_head mnode;
+   union {
+   struct {
+   struct in6_addr mf6c_mcastgrp;
+   struct in6_addr mf6c_origin;
+   };
+   struct mfc6_cache_cmp_arg cmparg;
+   };
mifi_t mf6c_parent; /* Source interface 
*/
int mfc_flags;  /* Flags on line
*/
 
@@ -88,22 +99,13 @@ struct mfc6_cache {
unsigned char ttls[MAXMIFS];/* TTL thresholds   
*/
} res;
} mfc_un;
+   struct list_head list;
+   struct rcu_head rcu;
 };
 
 #define MFC_STATIC 1
 #define MFC_NOTIFY 2
 
-#define MFC6_LINES 64
-
-#define MFC6_HASH(a, g) (((__force u32)(a)->s6_addr32[0] ^ \
- (__force u32)(a)->s6_addr32[1] ^ \
- (__force u32)(a)->s6_addr32[2] ^ \
- (__force u32)(a)->s6_addr32[3] ^ \
- (__force u32)(g)->s6_addr32[0] ^ \
- (__force u32)(g)->s6_addr32[1] ^ \
- (__force u32)(g)->s6_addr32[2] ^ \
- (__force u32)(g)->s6_addr32[3]) % MFC6_LINES)
-
 #define MFC_ASSERT_THRESH (3*HZ)   /* Maximal freq. of asserts */
 
 struct rtmsg;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 7792fc5..717370e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -61,8 +61,9 @@ struct mr6_table {
struct sock __rcu   *mroute6_sk;
struct timer_list   ipmr_expire_timer;
struct list_headmfc6_unres_queue;
-   struct list_headmfc6_cache_array[MFC6_LINES];
struct vif_device   vif6_table[MAXMIFS];
+   struct rhltable mfc6_hash;
+   struct list_headmfc6_cache_list;
int maxvif;
atomic_tcache_resolve_queue_len;
boolmroute_do_assert;
@@ -299,10 +300,29 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 }
 #endif
 
+static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+   const struct mfc6_cache_cmp_arg *cmparg = arg->key;
+   struct mfc6_cache *c = (struct mfc6_cache *)ptr;
+
+   return !ipv6_addr_equal(>mf6c_mcastgrp, >mf6c_mcastgrp) ||
+  !ipv6_addr_equal(>mf6c_origin, >mf6c_origin);
+}
+
+static const struct rhashtable_params ip6mr_rht_params = {
+   .head_offset = offsetof(struct mfc6_cache, mnode),
+   .key_offset = offsetof(struct mfc6_cache, cmparg),
+   .key_len = sizeof(struct mfc6_cache_cmp_arg),
+   .nelem_hint = 3,
+   .locks_mul = 1,
+   .obj_cmpfn = ip6mr_hash_cmp,
+   .automatic_shrinking = true,
+};
+
 static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
 {
struct mr6_table *mrt;
-   unsigned int i;
 
mrt = ip6mr_get_table(net, id);
if (mrt)
@@ -314,10 +334,8 @@ static struct mr6_table *ip6mr_new_table(struct net *net, 
u32 id)
mrt->id = id;
write_pnet(>net, net);
 
-   /* Forwarding cache */
-   for (i = 0; i < MFC6_LINES; i++)
-   INIT_LIST_HEAD(>mfc6_cache_array[i]);
-
+   rhltable_init(>mfc6_hash, _rht_params);
+   INIT_LIST_HEAD(>mfc6_cache_list);
INIT_LIST_HEAD(>mfc6_unres_queue);
 
timer_setup(>ipmr_expire_timer, ipmr_expire_process, 0);
@@ -335,6 +353,7 @@ static void ip6mr_free_table(struct mr6_table *mrt)
 {
del_timer_sync(>ipmr_expire_timer);
mroute_clean_tables(mrt, true);
+   rhltable_destroy(>mfc6_hash

[RFC net-next 09/11] ipmr, ip6mr: Unite vif seq functions

2018-02-20 Thread Yuval Mintz

Same as previously done with the mfc seq, the logic for the vif seq is
refactored to be shared between ipmr and ip6mr.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h | 33 ++
 net/ipv4/ipmr.c | 49 +---
 net/ipv4/ipmr_base.c| 33 ++
 net/ipv6/ip6mr.c| 50 +
 4 files changed, 76 insertions(+), 89 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 413f103..edc6e6b 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -206,6 +206,12 @@ static inline void *mr_mfc_find(struct mr_table *mrt, void 
*hasharg)
 }
 
 #ifdef CONFIG_PROC_FS
+struct mr_vif_iter {
+   struct seq_net_private p;
+   struct mr_table *mrt;
+   int ct;
+};
+
 struct mr_mfc_iter {
struct seq_net_private p;
struct mr_table *mrt;
@@ -216,6 +222,16 @@ struct mr_mfc_iter {
 };
 
 #ifdef CONFIG_IP_MROUTE_COMMON
+void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos);
+void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos);
+
+static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+{
+   return *pos ? mr_vif_seq_idx(seq_file_net(seq),
+seq->private, *pos - 1)
+   : SEQ_START_TOKEN;
+}
+
 /* These actually return 'struct mr_mfc *', but to avoid need for explicit
  * castings they simply return void.
  */
@@ -249,6 +265,23 @@ static inline void mr_mfc_seq_stop(struct seq_file *seq, 
void *v)
rcu_read_unlock();
 }
 #else
+static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter,
+  loff_t pos)
+{
+   return NULL;
+}
+
+static inline void *mr_vif_seq_next(struct seq_file *seq,
+   void *v, loff_t *pos)
+{
+   return NULL;
+}
+
+static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+{
+   return NULL;
+}
+
 static inline void *mr_mfc_seq_idx(struct net *net,
   struct mr_mfc_iter *it, loff_t pos)
 {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 0281f89..6039751 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2908,31 +2908,11 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, 
struct netlink_callback *cb)
 /* The /proc interfaces to multicast routing :
  * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
  */
-struct ipmr_vif_iter {
-   struct seq_net_private p;
-   struct mr_table *mrt;
-   int ct;
-};
-
-static struct vif_device *ipmr_vif_seq_idx(struct net *net,
-   struct ipmr_vif_iter *iter,
-   loff_t pos)
-{
-   struct mr_table *mrt = iter->mrt;
-
-   for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
-   if (!VIF_EXISTS(mrt, iter->ct))
-   continue;
-   if (pos-- == 0)
-   return >vif_table[iter->ct];
-   }
-   return NULL;
-}
 
 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(mrt_lock)
 {
-   struct ipmr_vif_iter *iter = seq->private;
+   struct mr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt;
 
@@ -2943,26 +2923,7 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, 
loff_t *pos)
iter->mrt = mrt;
 
read_lock(_lock);
-   return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
-   : SEQ_START_TOKEN;
-}
-
-static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-   struct ipmr_vif_iter *iter = seq->private;
-   struct net *net = seq_file_net(seq);
-   struct mr_table *mrt = iter->mrt;
-
-   ++*pos;
-   if (v == SEQ_START_TOKEN)
-   return ipmr_vif_seq_idx(net, iter, 0);
-
-   while (++iter->ct < mrt->maxvif) {
-   if (!VIF_EXISTS(mrt, iter->ct))
-   continue;
-   return >vif_table[iter->ct];
-   }
-   return NULL;
+   return mr_vif_seq_start(seq, pos);
 }
 
 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
@@ -2973,7 +2934,7 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void 
*v)
 
 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 {
-   struct ipmr_vif_iter *iter = seq->private;
+   struct mr_vif_iter *iter = seq->private;
struct mr_table *mrt = iter->mrt;
 
if (v == SEQ_START_TOKEN) {
@@ -2996,7 +2957,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void 
*v)
 
 static const struct seq_operations ipmr_vif_seq_ops = {
.start = ipmr_vif_seq_start,
-   .next  = ipmr_vif_seq_next

[RFC net-next 06/11] ipmr, ip6mr: Make mfc_cache a common structure

2018-02-20 Thread Yuval Mintz

mfc_cache and mfc6_cache are almost identical - the main difference is
in the origin/group addresses and comparison-key. Make a common
structure encapsulating most of the multicast routing logic - mr_mfc -
and convert both ipmr and ip6mr to use it.

For easy conversion [casting, in this case] mr_mfc has to be the first
field inside every multicast routing abstraction utilizing it.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c |  21 +-
 include/linux/mroute.h|  45 +---
 include/linux/mroute6.h   |  23 +--
 include/linux/mroute_base.h   |  45 
 net/ipv4/ipmr.c   | 234 +++--
 net/ipv6/ip6mr.c  | 241 +++---
 6 files changed, 312 insertions(+), 297 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
index d20b143..978a3c7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
@@ -126,8 +126,8 @@ mlxsw_sp_mr_route_ivif_in_evifs(const struct 
mlxsw_sp_mr_route *mr_route)
 
switch (mr_route->mr_table->proto) {
case MLXSW_SP_L3_PROTO_IPV4:
-   ivif = mr_route->mfc4->mfc_parent;
-   return mr_route->mfc4->mfc_un.res.ttls[ivif] != 255;
+   ivif = mr_route->mfc4->_c.mfc_parent;
+   return mr_route->mfc4->_c.mfc_un.res.ttls[ivif] != 255;
case MLXSW_SP_L3_PROTO_IPV6:
/* fall through */
default:
@@ -364,7 +364,7 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table 
*mr_table,
mr_route->mfc4 = mfc;
mr_route->mr_table = mr_table;
for (i = 0; i < MAXVIFS; i++) {
-   if (mfc->mfc_un.res.ttls[i] != 255) {
+   if (mfc->_c.mfc_un.res.ttls[i] != 255) {
err = mlxsw_sp_mr_route_evif_link(mr_route,
  _table->vifs[i]);
if (err)
@@ -374,7 +374,8 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table 
*mr_table,
mr_route->min_mtu = mr_table->vifs[i].dev->mtu;
}
}
-   mlxsw_sp_mr_route_ivif_link(mr_route, _table->vifs[mfc->mfc_parent]);
+   mlxsw_sp_mr_route_ivif_link(mr_route,
+   _table->vifs[mfc->_c.mfc_parent]);
 
mr_route->route_action = mlxsw_sp_mr_route_action(mr_route);
return mr_route;
@@ -418,9 +419,9 @@ static void mlxsw_sp_mr_mfc_offload_set(struct 
mlxsw_sp_mr_route *mr_route,
switch (mr_route->mr_table->proto) {
case MLXSW_SP_L3_PROTO_IPV4:
if (offload)
-   mr_route->mfc4->mfc_flags |= MFC_OFFLOAD;
+   mr_route->mfc4->_c.mfc_flags |= MFC_OFFLOAD;
else
-   mr_route->mfc4->mfc_flags &= ~MFC_OFFLOAD;
+   mr_route->mfc4->_c.mfc_flags &= ~MFC_OFFLOAD;
break;
case MLXSW_SP_L3_PROTO_IPV6:
/* fall through */
@@ -943,10 +944,10 @@ static void mlxsw_sp_mr_route_stats_update(struct 
mlxsw_sp *mlxsw_sp,
 
switch (mr_route->mr_table->proto) {
case MLXSW_SP_L3_PROTO_IPV4:
-   if (mr_route->mfc4->mfc_un.res.pkt != packets)
-   mr_route->mfc4->mfc_un.res.lastuse = jiffies;
-   mr_route->mfc4->mfc_un.res.pkt = packets;
-   mr_route->mfc4->mfc_un.res.bytes = bytes;
+   if (mr_route->mfc4->_c.mfc_un.res.pkt != packets)
+   mr_route->mfc4->_c.mfc_un.res.lastuse = jiffies;
+   mr_route->mfc4->_c.mfc_un.res.pkt = packets;
+   mr_route->mfc4->_c.mfc_un.res.bytes = bytes;
break;
case MLXSW_SP_L3_PROTO_IPV6:
/* fall through */
diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 8688c5d..63b36e6 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -81,28 +81,13 @@ struct mfc_cache_cmp_arg {
 
 /**
  * struct mfc_cache - multicast routing entries
- * @mnode: rhashtable list
+ * @_c: Common multicast routing information; has to be first [for casting]
  * @mfc_mcastgrp: destination multicast group address
  * @mfc_origin: source address
  * @cmparg: used for rhashtable comparisons
- * @mfc_parent: source interface (iif)
- * @mfc_flags: entry flags
- * @expires: unresolved entry expire time
- * @unresolved: unresolved cached skbs
- * @last_assert: time of last assert
- * @minvif: minimum VIF id
- * @maxvif: maximum VIF id
- * @bytes: bytes that have passed for this entry
- * @pkt: packets that have pas

[RFC net-next 00/11] ipmr, ip6mr: Align multicast routing for IPv4 & IPv6

2018-02-20 Thread Yuval Mintz

Historically, ip6mr was based [cut-n-paste] on ipmr, and the two have not
diverged too much. Apparently, as ipv4 multicast routing is more common
than its ipv6 counterpart, modifications since then have mostly been
one-way, affecting ipmr while leaving ip6mr unchanged.

This series is meant to re-factor both ipmr and ip6mr into having common
structures [and some functionality], adding 2 new common files -
mroute_base.h and ipmr_base.c.

The series begins by bringing ip6mr up to speed with some of the changes
applied in the past to ipmr [#2, #3].
It is then possible to re-factor a lot of the common structures -
vif devices [#1], mr_table [#4], mfc_cache [#6] - and use the common
structures in both ipmr and ip6mr.

The rest of the patches re-factor some choice flows used by both ipmr
and ip6mr and eliminate duplication.

This series would later allow for easy extension of ipmr offloading
to support ip6mr offloading as well, as almost all structures
related to the offloading would be shared between the two protocols.

Yuval Mintz (11):
  ipmr,ipmr6: Define a uniform vif_device
  ip6mr: Make mroute_sk rcu-based
  ip6mr: Align hash implementation to ipmr
  mroute*: Make mr_table a common struct
  ipmr, ip6mr: Unite creation of new mr_table
  ipmr, ip6mr: Make mfc_cache a common structure
  ipmr, ip6mr: Unite logic for searching in MFC cache
  ipmr, ip6mr: Unite mfc seq logic
  ipmr, ip6mr: Unite vif seq logic
  ip6mr: Remove MFC_NOTIFY and refactor flags
  ipmr, ip6mr: Unite dumproute flows

 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c |  21 +-
 include/linux/mroute.h|  88 +-
 include/linux/mroute6.h   |  62 +-
 include/linux/mroute_base.h   | 346 
 include/net/netns/ipv6.h  |   2 +-
 net/ipv4/Kconfig  |   5 +
 net/ipv4/Makefile |   1 +
 net/ipv4/ipmr.c   | 576 -
 net/ipv4/ipmr_base.c  | 323 +++
 net/ipv6/Kconfig  |   1 +
 net/ipv6/ip6_output.c |   2 +-
 net/ipv6/ip6mr.c  | 984 --
 12 files changed, 1240 insertions(+), 1171 deletions(-)
 create mode 100644 include/linux/mroute_base.h
 create mode 100644 net/ipv4/ipmr_base.c

-- 
2.4.3

[RFC net-next 11/11] ipmr, ip6mr: Unite dumproute flows

2018-02-20 Thread Yuval Mintz

The various MFC entries are being held in the same kind of mr_tables
for both ipmr and ip6mr, and their traversal logic is identical.
Also, with the exception of the addresses [and other small tidbits]
the major bulk of the nla setting is identical.

Unite as much of the dumping as possible between the two.
Notice this requires creating an mr_table iterator for each, as the
for-each preprocessor macro can't be used by the common logic.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h |  29 +
 net/ipv4/ipmr.c | 154 +---
 net/ipv4/ipmr_base.c| 123 +++
 net/ipv6/ip6mr.c| 149 +-
 4 files changed, 216 insertions(+), 239 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 2054118..2da30da 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -170,6 +170,16 @@ void *mr_mfc_find_parent(struct mr_table *mrt,
 void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi);
 void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg);
 
+int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+  struct mr_mfc *c, struct rtmsg *rtm);
+int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
+struct mr_table *(*iter)(struct net *net,
+ struct mr_table *mrt),
+int (*fill)(struct mr_table *mrt,
+struct sk_buff *skb,
+u32 portid, u32 seq, struct mr_mfc *c,
+int cmd, int flags),
+spinlock_t *lock);
 #else
 static inline void vif_device_init(struct vif_device *v,
   struct net_device *dev,
@@ -207,6 +217,25 @@ static inline struct mr_mfc *mr_mfc_find_any(struct 
mr_table *mrt,
 {
return NULL;
 }
+
+static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+struct mr_mfc *c, struct rtmsg *rtm)
+{
+   return -EINVAL;
+}
+
+static inline int
+mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
+struct mr_table *(*iter)(struct net *net,
+ struct mr_table *mrt),
+int (*fill)(struct mr_table *mrt,
+struct sk_buff *skb,
+u32 portid, u32 seq, struct mr_mfc *c,
+int cmd, int flags),
+spinlock_t *lock)
+{
+   return -EINVAL;
+}
 #endif
 
 static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 6039751..446b372 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -104,8 +104,6 @@ static void ip_mr_forward(struct net *net, struct mr_table 
*mrt,
  struct mfc_cache *cache, int local);
 static int ipmr_cache_report(struct mr_table *mrt,
 struct sk_buff *pkt, vifi_t vifi, int assert);
-static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
- struct mr_mfc *c, struct rtmsg *rtm);
 static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
 int cmd);
 static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
@@ -116,6 +114,16 @@ static void ipmr_expire_process(struct timer_list *t);
 #define ipmr_for_each_table(mrt, net) \
list_for_each_entry_rcu(mrt, >ipv4.mr_tables, list)
 
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+  struct mr_table *mrt)
+{
+   if (!mrt)
+   return list_entry_rcu(>ipv4.mr_tables->next,
+ typeof(mrt), list);
+   return list_entry_rcu(mrt->list.next,
+ typeof(mrt), list);
+}
+
 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -283,6 +291,14 @@ EXPORT_SYMBOL(ipmr_rule_default);
 #define ipmr_for_each_table(mrt, net) \
for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
 
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+  struct mr_table *mrt)
+{
+   if (!mrt)
+   return net->ipv4.mrt;
+   return NULL;
+}
+
 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 {
return net->ipv4.mrt;
@@ -1050,8 +1066,8 @@ static void ipmr_cache_resolve(struct net *net, struct 
mr_table *mrt,
struct nlmsghdr *nlh = skb_pull(skb,
sizeof(struct iphdr));
 
-   if (__ipmr_fill_mroute(mrt, skb, >_c,
-

[RFC net-next 04/11] mroute*: Make mr_table a common struct

2018-02-20 Thread Yuval Mintz

Following previous changes to ip6mr, mr_table and mr6_table are
basically the same [up to mr6_table having additional '6' suffixes to
its variable names].
Move the common structure definition into a common header; This
requires renaming all references in ip6mr to variables that had the
distinct suffix.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute.h  |  21 ---
 include/linux/mroute6.h |   1 -
 include/linux/mroute_base.h |  46 +++
 include/net/netns/ipv6.h|   2 +-
 net/ipv4/ipmr.c |   2 -
 net/ipv6/ip6mr.c| 311 
 6 files changed, 191 insertions(+), 192 deletions(-)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index b8aadff..8688c5d 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -4,8 +4,6 @@
 
 #include 
 #include 
-#include 
-#include 
 #include 
 #include 
 #include 
@@ -67,25 +65,6 @@ struct vif_entry_notifier_info {
 
 #define VIFF_STATIC 0x8000
 
-#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
-
-struct mr_table {
-   struct list_headlist;
-   possible_net_t  net;
-   u32 id;
-   struct sock __rcu   *mroute_sk;
-   struct timer_list   ipmr_expire_timer;
-   struct list_headmfc_unres_queue;
-   struct vif_device   vif_table[MAXVIFS];
-   struct rhltable mfc_hash;
-   struct list_headmfc_cache_list;
-   int maxvif;
-   atomic_tcache_resolve_queue_len;
-   boolmroute_do_assert;
-   boolmroute_do_pim;
-   int mroute_reg_vif_num;
-};
-
 /* mfc_flags:
  * MFC_STATIC - the entry was added statically (not by a routing daemon)
  * MFC_OFFLOAD - the entry was offloaded to the hardware
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index e2dac19..d5c8dc1 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -8,7 +8,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline int ip6_mroute_opt(int opt)
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 0de651e..1cc944a 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -2,6 +2,9 @@
 #define __LINUX_MROUTE_BASE_H
 
 #include 
+#include 
+#include 
+#include 
 
 /**
  * struct vif_device - interface representor for multicast routing
@@ -32,6 +35,49 @@ struct vif_device {
__be32 local, remote;
 };
 
+#ifndef MAXVIFS
+/* This one is nasty; value is defined in uapi using different symbols for
+ * mroute and mroute6 but both map into same 32.
+ */
+#define MAXVIFS32
+#endif
+
+#define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev))
+
+/**
+ * struct mr_table - a multicast routing table
+ * @list: entry within a list of multicast routing tables
+ * @net: net where this table belongs
+ * @id: identifier of the table
+ * @mroute_sk: socket associated with the table
+ * @ipmr_expire_timer: timer for handling unresolved routes
+ * @mfc_unres_queue: list of unresolved MFC entries
+ * @vif_table: array containing all possible vifs
+ * @mfc_hash: Hash table of all resolved routes for easy lookup
+ * @mfc_cache_list: list of resolved routes for possible traversal
+ * @maxvif: Identifier of highest value vif currently in use
+ * @cache_resolve_queue_len: current size of unresolved queue
+ * @mroute_do_assert: Whether to inform userspace on wrong ingress
+ * @mroute_do_pim: Whether to receive IGMP PIMv1
+ * @mroute_reg_vif_num: PIM-device vif index
+ */
+struct mr_table {
+   struct list_headlist;
+   possible_net_t  net;
+   u32 id;
+   struct sock __rcu   *mroute_sk;
+   struct timer_list   ipmr_expire_timer;
+   struct list_headmfc_unres_queue;
+   struct vif_device   vif_table[MAXVIFS];
+   struct rhltable mfc_hash;
+   struct list_headmfc_cache_list;
+   int maxvif;
+   atomic_tcache_resolve_queue_len;
+   boolmroute_do_assert;
+   boolmroute_do_pim;
+   int mroute_reg_vif_num;
+};
+
 #ifdef CONFIG_IP_MROUTE_COMMON
 void vif_device_init(struct vif_device *v,
 struct net_device *dev,
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 987cc45..0d177fa9 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -84,7 +84,7 @@ struct netns_ipv6 {
struct sock *mc_autojoin_sk;
 #ifdef CONFIG_IPV6_MROUTE
 #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
-   struct mr6_table*mrt6;
+   struct mr_table *mrt6;
 #else
struct list_headmr6_tables;
struct fib_rules_ops*mr6_rules_ops;
diff --git a/net/ipv4/ipmr.

[RFC net-next 05/11] ipmr, ip6mr: Unite creation of new mr_table

2018-02-20 Thread Yuval Mintz

Now that both ipmr and ip6mr are using the same mr_table structure,
we can have a common function to allocate & initialize a new instance.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h | 17 +
 net/ipv4/ipmr.c | 27 ++-
 net/ipv4/ipmr_base.c| 27 +++
 net/ipv6/ip6mr.c| 30 ++
 4 files changed, 64 insertions(+), 37 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 1cc944a..8053057 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -85,6 +85,13 @@ void vif_device_init(struct vif_device *v,
 unsigned char threshold,
 unsigned short flags,
 unsigned short get_iflink_mask);
+
+struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+  const struct rhashtable_params *rht_params,
+  void (*expire_func)(struct timer_list *t),
+  void (*table_set)(struct mr_table *mrt,
+struct net *net));
 #else
 static inline void vif_device_init(struct vif_device *v,
   struct net_device *dev,
@@ -94,5 +101,15 @@ static inline void vif_device_init(struct vif_device *v,
   unsigned short get_iflink_mask)
 {
 }
+
+static inline struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+  const struct rhashtable_params *rht_params,
+  void (*expire_func)(struct timer_list *t),
+  void (*table_set)(struct mr_table *mrt,
+struct net *net))
+{
+   return NULL;
+}
 #endif
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 78046d2..67f752f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -351,6 +351,14 @@ static const struct rhashtable_params ipmr_rht_params = {
.automatic_shrinking = true,
 };
 
+static void ipmr_new_table_set(struct mr_table *mr,
+  struct net *net)
+{
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+   list_add_tail_rcu(>list, >ipv4.mr_tables);
+#endif
+}
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -363,23 +371,8 @@ static struct mr_table *ipmr_new_table(struct net *net, 
u32 id)
if (mrt)
return mrt;
 
-   mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
-   if (!mrt)
-   return ERR_PTR(-ENOMEM);
-   write_pnet(>net, net);
-   mrt->id = id;
-
-   rhltable_init(>mfc_hash, _rht_params);
-   INIT_LIST_HEAD(>mfc_cache_list);
-   INIT_LIST_HEAD(>mfc_unres_queue);
-
-   timer_setup(>ipmr_expire_timer, ipmr_expire_process, 0);
-
-   mrt->mroute_reg_vif_num = -1;
-#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
-   list_add_tail_rcu(>list, >ipv4.mr_tables);
-#endif
-   return mrt;
+   return mr_table_alloc(net, id, _rht_params,
+ ipmr_expire_process, ipmr_new_table_set);
 }
 
 static void ipmr_free_table(struct mr_table *mrt)
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 22758f8..3e21a58 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -26,3 +26,30 @@ void vif_device_init(struct vif_device *v,
v->link = dev->ifindex;
 }
 EXPORT_SYMBOL(vif_device_init);
+
+struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+  const struct rhashtable_params *rht_params,
+  void (*expire_func)(struct timer_list *t),
+  void (*table_set)(struct mr_table *mrt,
+struct net *net))
+{
+   struct mr_table *mrt;
+
+   mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+   if (!mrt)
+   return NULL;
+   mrt->id = id;
+   write_pnet(>net, net);
+
+   rhltable_init(>mfc_hash, rht_params);
+   INIT_LIST_HEAD(>mfc_cache_list);
+   INIT_LIST_HEAD(>mfc_unres_queue);
+
+   timer_setup(>ipmr_expire_timer, expire_func, 0);
+
+   mrt->mroute_reg_vif_num = -1;
+   table_set(mrt, net);
+   return mrt;
+}
+EXPORT_SYMBOL(mr_table_alloc);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 0095a43..9b11321 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -31,7 +31,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -295,6 +294,14 @@ static const struct rhashtable_params ip6mr_rht_params = {
.automatic_shrinking = true,
 };
 
+static void ip6mr_new_table_set(struct mr_table *mrt,
+   struct net *net)
+{
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+   list_add_tail_rcu(>list, >ipv6.mr_tables);
+#endif
+}
+
 static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -303,25 +310,8 @@ static struct mr_table *ip6mr_new

[RFC net-next 10/11] ip6mr: Remove MFC_NOTIFY and refactor flags

2018-02-20 Thread Yuval Mintz

MFC_NOTIFY exists in ip6mr, probably as some legacy code
[was already removed for ipmr in commit
06bd6c0370bb ("net: ipmr: remove unused MFC_NOTIFY flag and make the flags 
enum")].
Remove it from ip6mr as well, and move the enum into a common file;
Notice MFC_OFFLOAD is currently only used by ipmr.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute.h  | 9 -
 include/linux/mroute6.h | 3 ---
 include/linux/mroute_base.h | 9 +
 net/ipv6/ip6mr.c| 3 ---
 4 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 63b36e6..7ed82e4 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -65,15 +65,6 @@ struct vif_entry_notifier_info {
 
 #define VIFF_STATIC 0x8000
 
-/* mfc_flags:
- * MFC_STATIC - the entry was added statically (not by a routing daemon)
- * MFC_OFFLOAD - the entry was offloaded to the hardware
- */
-enum {
-   MFC_STATIC = BIT(0),
-   MFC_OFFLOAD = BIT(1),
-};
-
 struct mfc_cache_cmp_arg {
__be32 mfc_mcastgrp;
__be32 mfc_origin;
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 6acf576..1ac38e6 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -81,9 +81,6 @@ struct mfc6_cache {
};
 };
 
-#define MFC_STATIC 1
-#define MFC_NOTIFY 2
-
 #define MFC_ASSERT_THRESH (3*HZ)   /* Maximal freq. of asserts */
 
 struct rtmsg;
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index edc6e6b..2054118 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -45,6 +45,15 @@ struct vif_device {
 
 #define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev))
 
+/* mfc_flags:
+ * MFC_STATIC - the entry was added statically (not by a routing daemon)
+ * MFC_OFFLOAD - the entry was offloaded to the hardware
+ */
+enum {
+   MFC_STATIC = BIT(0),
+   MFC_OFFLOAD = BIT(1),
+};
+
 /**
  * struct mr_mfc - common multicast routing entries
  * @mnode: rhashtable list
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d0df242..1717638 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2208,9 +2208,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, 
struct rtmsg *rtm,
return err;
}
 
-   if (rtm->rtm_flags & RTM_F_NOTIFY)
-   cache->_c.mfc_flags |= MFC_NOTIFY;
-
err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
read_unlock(_lock);
return err;
-- 
2.4.3

[RFC net-next 08/11] ipmr, ip6mr: Unite mfc seq logic

2018-02-20 Thread Yuval Mintz

With the exception of the final dump, ipmr and ip6mr have the exact same
seq logic for traversing a given mr_table. Refactor that code and make
it common.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h | 69 
 net/ipv4/ipmr.c | 93 +++
 net/ipv4/ipmr_base.c| 62 +
 net/ipv6/ip6mr.c| 97 -
 4 files changed, 143 insertions(+), 178 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 18a1d75..413f103 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -3,6 +3,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -203,4 +204,72 @@ static inline void *mr_mfc_find(struct mr_table *mrt, void 
*hasharg)
 {
return mr_mfc_find_parent(mrt, hasharg, -1);
 }
+
+#ifdef CONFIG_PROC_FS
+struct mr_mfc_iter {
+   struct seq_net_private p;
+   struct mr_table *mrt;
+   struct list_head *cache;
+
+   /* Lock protecting the mr_table's unresolved queue */
+   spinlock_t *lock;
+};
+
+#ifdef CONFIG_IP_MROUTE_COMMON
+/* These actually return 'struct mr_mfc *', but to avoid need for explicit
+ * castings they simply return void.
+ */
+void *mr_mfc_seq_idx(struct net *net,
+struct mr_mfc_iter *it, loff_t pos);
+void *mr_mfc_seq_next(struct seq_file *seq, void *v,
+ loff_t *pos);
+
+static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos,
+struct mr_table *mrt, spinlock_t *lock)
+{
+   struct mr_mfc_iter *it = seq->private;
+
+   it->mrt = mrt;
+   it->cache = NULL;
+   it->lock = lock;
+
+   return *pos ? mr_mfc_seq_idx(seq_file_net(seq),
+seq->private, *pos - 1)
+   : SEQ_START_TOKEN;
+}
+
+static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+   struct mr_mfc_iter *it = seq->private;
+   struct mr_table *mrt = it->mrt;
+
+   if (it->cache == >mfc_unres_queue)
+   spin_unlock_bh(it->lock);
+   else if (it->cache == >mfc_cache_list)
+   rcu_read_unlock();
+}
+#else
+static inline void *mr_mfc_seq_idx(struct net *net,
+  struct mr_mfc_iter *it, loff_t pos)
+{
+   return NULL;
+}
+
+static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v,
+   loff_t *pos)
+{
+   return NULL;
+}
+
+static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos,
+struct mr_table *mrt, spinlock_t *lock)
+{
+   return NULL;
+}
+
+static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+}
+#endif
+#endif
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 0df4dd5..0281f89 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -3014,41 +3014,8 @@ static const struct file_operations ipmr_vif_fops = {
.release = seq_release_net,
 };
 
-struct ipmr_mfc_iter {
-   struct seq_net_private p;
-   struct mr_table *mrt;
-   struct list_head *cache;
-};
-
-static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
- struct ipmr_mfc_iter *it, loff_t pos)
-{
-   struct mr_table *mrt = it->mrt;
-   struct mr_mfc *mfc;
-
-   rcu_read_lock();
-   it->cache = >mfc_cache_list;
-   list_for_each_entry_rcu(mfc, >mfc_cache_list, list)
-   if (pos-- == 0)
-   return (struct mfc_cache *)mfc;
-   rcu_read_unlock();
-
-   spin_lock_bh(_unres_lock);
-   it->cache = >mfc_unres_queue;
-   list_for_each_entry(mfc, it->cache, list)
-   if (pos-- == 0)
-   return (struct mfc_cache *)mfc;
-
-   spin_unlock_bh(_unres_lock);
-
-   it->cache = NULL;
-   return NULL;
-}
-
-
 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 {
-   struct ipmr_mfc_iter *it = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt;
 
@@ -3056,57 +3023,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, 
loff_t *pos)
if (!mrt)
return ERR_PTR(-ENOENT);
 
-   it->mrt = mrt;
-   it->cache = NULL;
-   return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
-   : SEQ_START_TOKEN;
-}
-
-static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-   struct ipmr_mfc_iter *it = seq->private;
-   struct net *net = seq_file_net(seq);
-   struct mr_table *mrt = it->mrt;
-   struct mfc_cache *mfc = v;
-
-   ++*pos;
-
-   if (v == SEQ_START_TOKEN)
-   return ipmr_mfc_seq_idx(net, seq->private, 0);
-
-   if (mfc->_c.list.n

[RFC net-next 01/11] ipmr,ipmr6: Define a uniform vif_device

2018-02-20 Thread Yuval Mintz

The two implementations have almost identical structures - vif_device and
mif_device. As a step toward uniforming the mr_tables, eliminate the
mif_device and relocate the vif_device definition into a new common
header file.

Also, introduce a common initializing function for setting most of the
vif_device fields in a new common source file. This requires modifying
the ipv{4,6} Kconfig and ipv4 makefile as we're introducing a new common
config option - CONFIG_IP_MROUTE_COMMON.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute.h  | 13 +---
 include/linux/mroute6.h | 11 +-
 include/linux/mroute_base.h | 52 +
 net/ipv4/Kconfig|  5 +
 net/ipv4/Makefile   |  1 +
 net/ipv4/ipmr.c | 32 +---
 net/ipv4/ipmr_base.c| 28 
 net/ipv6/Kconfig|  1 +
 net/ipv6/ip6mr.c| 37 
 9 files changed, 117 insertions(+), 63 deletions(-)
 create mode 100644 include/linux/mroute_base.h
 create mode 100644 net/ipv4/ipmr_base.c

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 5396521..b8aadff 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_IP_MROUTE
 static inline int ip_mroute_opt(int opt)
@@ -56,18 +57,6 @@ static inline bool ipmr_rule_default(const struct fib_rule 
*rule)
 }
 #endif
 
-struct vif_device {
-   struct net_device   *dev;   /* Device we are using 
*/
-   struct netdev_phys_item_id dev_parent_id;   /* Device parent ID
*/
-   unsigned long   bytes_in,bytes_out;
-   unsigned long   pkt_in,pkt_out; /* Statistics   
*/
-   unsigned long   rate_limit; /* Traffic shaping (NI) 
*/
-   unsigned char   threshold;  /* TTL threshold
*/
-   unsigned short  flags;  /* Control flags
*/
-   __be32  local,remote;   /* Addresses(remote for 
tunnels)*/
-   int link;   /* Physical interface index 
*/
-};
-
 struct vif_entry_notifier_info {
struct fib_notifier_info info;
struct net_device *dev;
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 3014c52..e5e5b82 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -7,6 +7,7 @@
 #include   /* for struct sk_buff_head */
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline int ip6_mroute_opt(int opt)
@@ -62,16 +63,6 @@ static inline void ip6_mr_cleanup(void)
 }
 #endif
 
-struct mif_device {
-   struct net_device   *dev;   /* Device we are using 
*/
-   unsigned long   bytes_in,bytes_out;
-   unsigned long   pkt_in,pkt_out; /* Statistics   
*/
-   unsigned long   rate_limit; /* Traffic shaping (NI) 
*/
-   unsigned char   threshold;  /* TTL threshold
*/
-   unsigned short  flags;  /* Control flags
*/
-   int link;   /* Physical interface index 
*/
-};
-
 #define VIFF_STATIC 0x8000
 
 struct mfc6_cache {
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
new file mode 100644
index 000..0de651e
--- /dev/null
+++ b/include/linux/mroute_base.h
@@ -0,0 +1,52 @@
+#ifndef __LINUX_MROUTE_BASE_H
+#define __LINUX_MROUTE_BASE_H
+
+#include 
+
+/**
+ * struct vif_device - interface representor for multicast routing
+ * @dev: network device being used
+ * @bytes_in: statistic; bytes ingressing
+ * @bytes_out: statistic; bytes egressing
+ * @pkt_in: statistic; packets ingressing
+ * @pkt_out: statistic; packets egressing
+ * @rate_limit: Traffic shaping (NI)
+ * @threshold: TTL threshold
+ * @flags: Control flags
+ * @link: Physical interface index
+ * @dev_parent_id: device parent id
+ * @local: Local address
+ * @remote: Remote address for tunnels
+ */
+struct vif_device {
+   struct net_device *dev;
+   unsigned long bytes_in, bytes_out;
+   unsigned long pkt_in, pkt_out;
+   unsigned long rate_limit;
+   unsigned char threshold;
+   unsigned short flags;
+   int link;
+
+   /* Currently only used by ipmr */
+   struct netdev_phys_item_id dev_parent_id;
+   __be32 local, remote;
+};
+
+#ifdef CONFIG_IP_MROUTE_COMMON
+void vif_device_init(struct vif_device *v,
+struct net_device *dev,
+unsigned long rate_limit,
+unsigned char threshold,
+unsigned short flags,
+unsigned short get_iflink_mask);
+#else
+static inline void vif_device_init(struct vif_device *v,
+  struct net_devic

[RFC net-next 02/11] ip6mr: Make mroute_sk rcu-based

2018-02-20 Thread Yuval Mintz

In ipmr the mr_table socket is handled under RCU. Introduce the same
for ip6mr.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute6.h |  6 +++---
 net/ipv6/ip6_output.c   |  2 +-
 net/ipv6/ip6mr.c| 43 ++-
 3 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index e5e5b82..e1b9fb0 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -111,12 +111,12 @@ extern int ip6mr_get_route(struct net *net, struct 
sk_buff *skb,
   struct rtmsg *rtm, u32 portid);
 
 #ifdef CONFIG_IPV6_MROUTE
-extern struct sock *mroute6_socket(struct net *net, struct sk_buff *skb);
+bool mroute6_is_socket(struct net *net, struct sk_buff *skb);
 extern int ip6mr_sk_done(struct sock *sk);
 #else
-static inline struct sock *mroute6_socket(struct net *net, struct sk_buff *skb)
+static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
 {
-   return NULL;
+   return false;
 }
 static inline int ip6mr_sk_done(struct sock *sk)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 997c7f1..a6eb0e6 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -71,7 +71,7 @@ static int ip6_finish_output2(struct net *net, struct sock 
*sk, struct sk_buff *
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 
if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
-   ((mroute6_socket(net, skb) &&
+   ((mroute6_is_socket(net, skb) &&
 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 ipv6_chk_mcast_addr(dev, _hdr(skb)->daddr,
 _hdr(skb)->saddr))) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index e397990..7792fc5 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -58,7 +58,7 @@ struct mr6_table {
struct list_headlist;
possible_net_t  net;
u32 id;
-   struct sock *mroute6_sk;
+   struct sock __rcu   *mroute6_sk;
struct timer_list   ipmr_expire_timer;
struct list_headmfc6_unres_queue;
struct list_headmfc6_cache_array[MFC6_LINES];
@@ -1121,6 +1121,7 @@ static void ip6mr_cache_resolve(struct net *net, struct 
mr6_table *mrt,
 static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
  mifi_t mifi, int assert)
 {
+   struct sock *mroute6_sk;
struct sk_buff *skb;
struct mrt6msg *msg;
int ret;
@@ -1190,17 +1191,19 @@ static int ip6mr_cache_report(struct mr6_table *mrt, 
struct sk_buff *pkt,
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
 
-   if (!mrt->mroute6_sk) {
+   rcu_read_lock();
+   mroute6_sk = rcu_dereference(mrt->mroute6_sk);
+   if (!mroute6_sk) {
+   rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
 
mrt6msg_netlink_event(mrt, skb);
 
-   /*
-*  Deliver to user space multicast routing algorithms
-*/
+   /* Deliver to user space multicast routing algorithms */
ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb);
+   rcu_read_unlock();
if (ret < 0) {
net_warn_ratelimited("mroute6: pending queue full, dropping 
entries\n");
kfree_skb(skb);
@@ -1584,11 +1587,11 @@ static int ip6mr_sk_init(struct mr6_table *mrt, struct 
sock *sk)
 
rtnl_lock();
write_lock_bh(_lock);
-   if (likely(mrt->mroute6_sk == NULL)) {
-   mrt->mroute6_sk = sk;
-   net->ipv6.devconf_all->mc_forwarding++;
-   } else {
+   if (rtnl_dereference(mrt->mroute6_sk)) {
err = -EADDRINUSE;
+   } else {
+   rcu_assign_pointer(mrt->mroute6_sk, sk);
+   net->ipv6.devconf_all->mc_forwarding++;
}
write_unlock_bh(_lock);
 
@@ -1614,9 +1617,9 @@ int ip6mr_sk_done(struct sock *sk)
 
rtnl_lock();
ip6mr_for_each_table(mrt, net) {
-   if (sk == mrt->mroute6_sk) {
+   if (sk == rtnl_dereference(mrt->mroute6_sk)) {
write_lock_bh(_lock);
-   mrt->mroute6_sk = NULL;
+   RCU_INIT_POINTER(mrt->mroute6_sk, NULL);
net->ipv6.devconf_all->mc_forwarding--;
write_unlock_bh(_lock);
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -1630,11 +1633,12 @@ int ip6mr_sk_done(struct sock *sk)
}
}
rtnl_unlock();
+   synchronize_rcu();
 
return err;
 }
 
-struct sock *mroute6_socket(struct net *net, struct sk_buff *skb)
+bool mroute6_is_so

[RFC net-next 07/11] ipmr, ip6mr: Unite logic for searching in MFC cache

2018-02-20 Thread Yuval Mintz

ipmr and ip6mr utilize the exact same methods for searching the
hashed resolved connections, difference being only in the construction
of the hash comparison key.

In order to unite the flow, introduce an mr_table operation set that
would contain the protocol specific information required for common
flows, in this case - the hash parameters and a comparison key
representing a (*,*) route.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/linux/mroute_base.h | 52 +--
 net/ipv4/ipmr.c | 71 ++-
 net/ipv4/ipmr_base.c| 54 +++--
 net/ipv6/ip6mr.c| 74 +++--
 4 files changed, 134 insertions(+), 117 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 2769e2f..18a1d75 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -89,10 +89,23 @@ struct mr_mfc {
struct rcu_head rcu;
 };
 
+struct mr_table;
+
+/**
+ * struct mr_table_ops - callbacks and info for protocol-specific ops
+ * @rht_params: parameters for accessing the MFC hash
+ * @cmparg_any: a hash key to be used for matching on (*,*) routes
+ */
+struct mr_table_ops {
+   const struct rhashtable_params *rht_params;
+   void *cmparg_any;
+};
+
 /**
  * struct mr_table - a multicast routing table
  * @list: entry within a list of multicast routing tables
  * @net: net where this table belongs
+ * @ops: protocol specific operations
  * @id: identifier of the table
  * @mroute_sk: socket associated with the table
  * @ipmr_expire_timer: timer for handling unresolved routes
@@ -109,6 +122,7 @@ struct mr_mfc {
 struct mr_table {
struct list_headlist;
possible_net_t  net;
+   struct mr_table_ops ops;
u32 id;
struct sock __rcu   *mroute_sk;
struct timer_list   ipmr_expire_timer;
@@ -133,10 +147,19 @@ void vif_device_init(struct vif_device *v,
 
 struct mr_table *
 mr_table_alloc(struct net *net, u32 id,
-  const struct rhashtable_params *rht_params,
+  struct mr_table_ops *ops,
   void (*expire_func)(struct timer_list *t),
   void (*table_set)(struct mr_table *mrt,
 struct net *net));
+
+/* These actually return 'struct mr_mfc *', but to avoid need for explicit
+ * castings they simply return void.
+ */
+void *mr_mfc_find_parent(struct mr_table *mrt,
+void *hasharg, int parent);
+void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi);
+void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg);
+
 #else
 static inline void vif_device_init(struct vif_device *v,
   struct net_device *dev,
@@ -147,14 +170,37 @@ static inline void vif_device_init(struct vif_device *v,
 {
 }
 
-static inline struct mr_table *
+static inline void *
 mr_table_alloc(struct net *net, u32 id,
-  const struct rhashtable_params *rht_params,
+  struct mr_table_ops *ops,
   void (*expire_func)(struct timer_list *t),
   void (*table_set)(struct mr_table *mrt,
 struct net *net))
 {
return NULL;
 }
+
+static inline void *mr_mfc_find_parent(struct mr_table *mrt,
+  void *hasharg, int parent)
+{
+   return NULL;
+}
+
+static inline void *mr_mfc_find_any_parent(struct mr_table *mrt,
+  int vifi)
+{
+   return NULL;
+}
+
+static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt,
+int vifi, void *hasharg)
+{
+   return NULL;
+}
 #endif
+
+static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg)
+{
+   return mr_mfc_find_parent(mrt, hasharg, -1);
+}
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a47d061..0df4dd5 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -359,6 +359,16 @@ static void ipmr_new_table_set(struct mr_table *mr,
 #endif
 }
 
+static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = {
+   .mfc_mcastgrp = htonl(INADDR_ANY),
+   .mfc_origin = htonl(INADDR_ANY),
+};
+
+static struct mr_table_ops ipmr_mr_table_ops = {
+   .rht_params = _rht_params,
+   .cmparg_any = _mr_table_ops_cmparg_any,
+};
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
struct mr_table *mrt;
@@ -371,7 +381,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 
id)
if (mrt)
return mrt;
 
-   return mr_table_alloc(net, id, _rht_params,
+   return mr_table_alloc(net, id, _mr_table_ops,
  ipmr_expire_process, ipmr_new_table_set);
 }
 
@@ -972,33 +982,8 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table 
*mrt,
.mfc_mc

RE: [patch net-next 5/5] mlxsw: spectrum: qdiscs: Support stats for PRIO qdisc

2018-01-12 Thread Yuval Mintz

> > > Hm.  You need this just because you didn't add the backlog
> > > pointer to destroy?  AFAIK on destroy we are free to reset stats as
> > > well, thus simplifying your driver...  Let me know if I
> > > misunderstand.
> >
> > This is meant exactly for the scenario where qdisc didn't get
> > destroyed yet is no longer offloaded; E.g., if number of bands
> > increased beyond what we can offload. So we can't reset the
> > statistics in this case. [Although I might be the one to
> > misunderstand you, as the 'not destroyed' was explicitly mentioned
> > twice above]
> 
> I was trying to take some liberty with handling of destroy but your
> approach may actually end up being simpler.  I will withdraw my series
> for now and reuse your new callback once this series lands.
> 
> Do you have any objections to changing RED to behave more like prio
> (and other qdiscs) in principle?

From statistics' perspective? None.

RE: [patch net-next 5/5] mlxsw: spectrum: qdiscs: Support stats for PRIO qdisc

2018-01-11 Thread Yuval Mintz

> > Support basic stats for PRIO qdisc, which includes tx packets and bytes
> > count, drops count and backlog size. The rest of the stats are irrelevant
> > for this qdisc offload.
> > Since backlog is not only incremental but reflecting momentary value, in
> > case of a qdisc that stops being offloaded but is not destroyed, backlog
> > value needs to be updated about the un-offloading.
> > For that reason an unoffload function is being added to the ops struct.
> >
> > Signed-off-by: Nogah Frankel <nog...@mellanox.com>
> > Reviewed-by: Yuval Mintz <yuv...@mellanox.com>
> > Signed-off-by: Jiri Pirko <j...@mellanox.com>
> > ---
> >  .../net/ethernet/mellanox/mlxsw/spectrum_qdisc.c   | 92
> ++
> >  1 file changed, 92 insertions(+)
> >
> > diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
> b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
> > index 9e83edde7b35..272c04951e5d 100644
> > --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
> > +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
> > @@ -66,6 +66,11 @@ struct mlxsw_sp_qdisc_ops {
> >   void *xstats_ptr);
> > void (*clean_stats)(struct mlxsw_sp_port *mlxsw_sp_port,
> > struct mlxsw_sp_qdisc *mlxsw_sp_qdisc);
> > +   /* unoffload - to be used for a qdisc that stops being offloaded
> without
> > +* being destroyed.
> > +*/
> > +   void (*unoffload)(struct mlxsw_sp_port *mlxsw_sp_port,
> > + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, void
> *params);
> 
> Hm.  You need this just because you didn't add the backlog pointer
> to destroy?  AFAIK on destroy we are free to reset stats as well, thus
> simplifying your driver...  Let me know if I misunderstand.

This is meant exactly for the scenario where qdisc didn't get destroyed
yet is no longer offloaded; E.g., if number of bands increased beyond
what we can offload. So we can't reset the statistics in this case.
[Although I might be the one to misunderstand you,
as the 'not destroyed' was explicitly mentioned twice above]

> 
> >  };
> >
> >  struct mlxsw_sp_qdisc {
> > @@ -73,6 +78,9 @@ struct mlxsw_sp_qdisc {
> > u8 tclass_num;
> > union {
> > struct red_stats red;
> > +   struct mlxsw_sp_qdisc_prio_stats {
> > +   u64 backlog;
> 
> This is not a prio stat, it's a standard qstat.  I've added it to
> struct mlxsw_sp_qdisc_stats.  The reason you need to treat it
> separately is that RED has non-standard backlog handling which I'm
> trying to fix...
> 
> > +   } prio;
> > } xstats_base;
> > struct mlxsw_sp_qdisc_stats {
> > u64 tx_bytes;
> > @@ -144,6 +152,9 @@ mlxsw_sp_qdisc_replace(struct mlxsw_sp_port
> *mlxsw_sp_port, u32 handle,
> >
> >  err_bad_param:
> >  err_config:
> > +   if (mlxsw_sp_qdisc->handle == handle && ops->unoffload)
> > +   ops->unoffload(mlxsw_sp_port, mlxsw_sp_qdisc, params);
> > +
> > mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc);
> > return err;
> >  }
> 
> > @@ -479,6 +567,10 @@ int mlxsw_sp_setup_tc_prio(struct mlxsw_sp_port
> *mlxsw_sp_port,
> > switch (p->command) {
> > case TC_PRIO_DESTROY:
> > return mlxsw_sp_qdisc_destroy(mlxsw_sp_port,
> mlxsw_sp_qdisc);
> > +   case TC_PRIO_STATS:
> > +   return mlxsw_sp_qdisc_get_stats(mlxsw_sp_port,
> mlxsw_sp_qdisc,
> > +   >stats);
> > +
> 
> nit: extra new line intentional? :)
> 
> > default:
> > return -EOPNOTSUPP;
> > }

RE: [patch net-next 3/5] net: sch: prio: Add offload ability to PRIO qdisc

2018-01-11 Thread Yuval Mintz

> > > > +struct tc_prio_qopt_offload_params {
> > > > +   int bands;
> > > > +   u8 priomap[TC_PRIO_MAX + 1];
> > > > +   /* In case that a prio qdisc is offloaded and now is changed to 
> > > > a
> > > > +* non-offloadedable config, it needs to update the backlog 
> > > > value
> > > > +* to negate the HW backlog value.
> > > > +*/
> > > > +   u32 *backlog;
> > > > +};
> > >
> > > Could we please pass the full qstats on replace and destroy.  This
> > > simplifies the driver code and allows handling the qlen as well as
> > > backlog.  Please see the 2 patch series I sent earlier yesterday.
> >
> > That might give the false impression that offloading driver is expected
> > to correct all the qstats fields during destruction, whereas for most of
> > them it doesn't seem appropriate.
> 
> The driver is supposed to return the momentary stats to their
> original/SW-only value.  And the driver knows exactly which stats
> those are, just look at your patch 5, you handle backlog completely
> differently than other stats already.

*we* surely understand that now. I'm just mentioning it might
confuse future offloaders; No strong objection here.
And I agree the alternative [passing pointers to each momentary stat]
is quite ugly.

RE: [patch net-next 3/5] net: sch: prio: Add offload ability to PRIO qdisc

2018-01-11 Thread Yuval Mintz

> > +struct tc_prio_qopt_offload_params {
> > +   int bands;
> > +   u8 priomap[TC_PRIO_MAX + 1];
> > +   /* In case that a prio qdisc is offloaded and now is changed to a
> > +* non-offloadedable config, it needs to update the backlog value
> > +* to negate the HW backlog value.
> > +*/
> > +   u32 *backlog;
> > +};
> 
> Could we please pass the full qstats on replace and destroy.  This
> simplifies the driver code and allows handling the qlen as well as
> backlog.  Please see the 2 patch series I sent earlier yesterday.

That might give the false impression that offloading driver is expected
to correct all the qstats fields during destruction, whereas for most of
them it doesn't seem appropriate.

RE: [patch net-next v2 00/10] Add support for resource abstraction

2017-12-28 Thread Yuval Mintz

> >>> Many of the ASIC's internal resources are limited and are shared
> between
> >>> several hardware procedures. For example, unified hash-based memory
> can
> >>> be used for many lookup purposes, like FDB and LPM. In many cases the
> user
> >>> can provide a partitioning scheme for such a resource in order to
> perform
> >>> fine tuning for his application. In such cases performing driver reload is
> >>> needed for the changes to take place, thus this patchset also adds support
> >>> for hot reload.
> >>>
> >>> Such an abstraction can be coupled with devlink's dpipe interface, which
> >>> models the ASIC's pipeline as a graph of match/action tables. By
> modeling
> >>> the hardware resource object, and by coupling it to several dpipe tables,
> >>> further visibility can be achieved in order to debug ASIC-wide issues.
> >>>
> >>> The proposed interface will provide the user the ability to understand the
> >>> limitations of the hardware, and receive notification regarding its
> occupancy.
> >>> Furthermore, monitoring the resource occupancy can be done in real-
> time and
> >>> can be useful in many cases.
> >>
> >> In the last RFC (not v1, but RFC) I asked for some kind of description
> >> for each resource, and you and Arkadi have pushed back. Let's walk
> >> through an example to see what I mean:
> >>
> >> $ devlink resource show pci/:03:00.0
> >> pci/:03:00.0:
> >>  name kvd size 245760 size_valid true
> >>  resources:
> >>name linear size 98304 occ 0
> >>name hash_double size 60416
> >>name hash_single size 87040
> >>
> >> So this 2700 has 3 resources that can be managed -- some table or
> >> resource or something named 'kvd' with linear, hash_double and
> >> hash_single sub-resources. What are these names referring to? The
> above
> >> output gives no description, and 'kvd' is not an industry term. Further,
> >
> > These are internal resources specific to the ASIC. Would you like some
> > description to each or something like that?
> 
> devlink has some nice self-documenting capabilities. What's missing here
> is a description of what the resource is used for in standard terms --
> ipv4 host routes, fdb, nexthops, rifs, etc. Even if the description is a
> short list versus an exhaustive list of everything it is used for. e.g.,
> Why would a user decrease linear and increase hash_single or vice versa?
> 
> >
> >
> >> what are these sizes that a user can control? The output contains no
> >> units, no description, nothing. In short, the above output provides
> >> random numbers associated with random names.
> >
> > Units are now exposed from kernel, just this version of iproute2 patch
> > does not display it.
> 
> please provide an iproute2 patch that does so the full context of this
> patch set can be reviewed from a user perspective.
> 
> >
> >
> >>
> >> I can see dpipe tables exported by this device:
> >>
> >> $ devlink dpipe header show pci/:03:00.0
> >>
> >> pci/:03:00.0:
> >>  name mlxsw_meta
> >>  field:
> >>name erif_port bitwidth 32 mapping_type ifindex
> >>name l3_forward bitwidth 1
> >>name l3_drop bitwidth 1
> >>name adj_index bitwidth 32
> >>name adj_size bitwidth 32
> >>name adj_hash_index bitwidth 32
> >>
> >>  name ipv6
> >>  field:
> >>name destination ip bitwidth 128
> >>
> >>  name ipv4
> >>  field:
> >>name destination ip bitwidth 32
> >>
> >>  name ethernet
> >>  field:
> >>name destination mac bitwidth 48
> >>
> >> but none mention 'kvd' or 'linear' or 'hash' and none of the other
> >> various devlink options:
> >>
> >> $ devlink
> >> Usage: devlink [ OPTIONS ] OBJECT { COMMAND | help }
> >> where  OBJECT := { dev | port | sb | monitor | dpipe }
> >>
> >> seem to relate to resources.
> >>
> >> So how does a user know what they are controlling by this 'resource'
> >> option? Is the user expected to have a PRM or user guide on hand for the
> >> specific device model that is being configured?
> >
> > The relation of specific dpipe table to specific resource is exposed by
> > the kernel as well. Probably the iproute2 patch just does not display
> > it.
> 
> please provide an iproute2 patch that does so the full context of this
> patch set can be reviewed from a user perspective.
> 
> >
> >
> >>
> >> Again, I have no objections to kvd, linear, hash, etc terms as they do
> >> relate to Mellanox products. But kvd/linear, for example, does correlate
> >> to industry standard concepts in some way. My request is that the
> >> resource listing guide the user in some way, stating what these
> >> resources mean.
> >
> > So the shown relation to dpipe table would be enough or you would still
> > like to see some description? I don't like the description concept here
> > as the relations to dpipe table should tell user exactly what he needs
> > to know.
> 
> I believe it is useful to have a 1-line, short description that gives
> the user some memory jogger as to what the resource is used for. It does
> not have to be an

RE: [patch net-next v2 00/10] Add support for resource abstraction

2017-12-28 Thread Yuval Mintz

>  Many of the ASIC's internal resources are limited and are shared
> between
>  several hardware procedures. For example, unified hash-based memory
> can
>  be used for many lookup purposes, like FDB and LPM. In many cases the
> user
>  can provide a partitioning scheme for such a resource in order to
> perform
>  fine tuning for his application. In such cases performing driver reload 
>  is
>  needed for the changes to take place, thus this patchset also adds
> support
>  for hot reload.
> 
>  Such an abstraction can be coupled with devlink's dpipe interface, which
>  models the ASIC's pipeline as a graph of match/action tables. By
> modeling
>  the hardware resource object, and by coupling it to several dpipe tables,
>  further visibility can be achieved in order to debug ASIC-wide issues.
> 
>  The proposed interface will provide the user the ability to understand
> the
>  limitations of the hardware, and receive notification regarding its
> occupancy.
>  Furthermore, monitoring the resource occupancy can be done in real-
> time and
>  can be useful in many cases.
> >>>
> >>> In the last RFC (not v1, but RFC) I asked for some kind of description
> >>> for each resource, and you and Arkadi have pushed back. Let's walk
> >>> through an example to see what I mean:
> >>>
> >>> $ devlink resource show pci/:03:00.0
> >>> pci/:03:00.0:
> >>>  name kvd size 245760 size_valid true
> >>>  resources:
> >>>name linear size 98304 occ 0
> >>>name hash_double size 60416
> >>>name hash_single size 87040
> >>>
> >>> So this 2700 has 3 resources that can be managed -- some table or
> >>> resource or something named 'kvd' with linear, hash_double and
> >>> hash_single sub-resources. What are these names referring to? The
> above
> >>> output gives no description, and 'kvd' is not an industry term. Further,
> >>
> >> These are internal resources specific to the ASIC. Would you like some
> >> description to each or something like that?
> >
> > devlink has some nice self-documenting capabilities. What's missing here
> > is a description of what the resource is used for in standard terms --
> > ipv4 host routes, fdb, nexthops, rifs, etc. Even if the description is a
> > short list versus an exhaustive list of everything it is used for. e.g.,
> > Why would a user decrease linear and increase hash_single or vice versa?
> 
> 
> Arkadi, on what david says above, can the resource names and ids not
> be driver specific, but moved up to the switchdev layer and just map
> to fdb, host routes, nexthops table sizes etc ?.
> Can these generic networking resources then in-turn be mapped to kvd
> sizes by the driver ?

I think it goes the other way around. The dpipe tables are the ones that
can be translated to functionality; The resources are internal and HW-specific
representing the possible internal division of resources -
but a given resource isn't necessarily mapped to a single networking feature.
[It might be in some cases, but not in the general case]

You could always move to a structured approach where each resource
in the hierarchy is further split to sub-resources until each leaf represents
a single network concept - but that would stop being an abstraction of the
HW resources and become a SW implementation instead, as SW would
have to be the one to maintain and enforce the resource distribution.
And that's not what we're trying to achieve here.

RE: [patch net-next v2 00/10] Add support for resource abstraction

2017-12-27 Thread Yuval Mintz

> >> Many of the ASIC's internal resources are limited and are shared between
> >> several hardware procedures. For example, unified hash-based memory
> can
> >> be used for many lookup purposes, like FDB and LPM. In many cases the
> user
> >> can provide a partitioning scheme for such a resource in order to perform
> >> fine tuning for his application. In such cases performing driver reload is
> >> needed for the changes to take place, thus this patchset also adds support
> >> for hot reload.
> >>
> >> Such an abstraction can be coupled with devlink's dpipe interface, which
> >> models the ASIC's pipeline as a graph of match/action tables. By modeling
> >> the hardware resource object, and by coupling it to several dpipe tables,
> >> further visibility can be achieved in order to debug ASIC-wide issues.
> >>
> >> The proposed interface will provide the user the ability to understand the
> >> limitations of the hardware, and receive notification regarding its
> occupancy.
> >> Furthermore, monitoring the resource occupancy can be done in real-time
> and
> >> can be useful in many cases.
> >
> >In the last RFC (not v1, but RFC) I asked for some kind of description
> >for each resource, and you and Arkadi have pushed back. Let's walk
> >through an example to see what I mean:
> >
> >$ devlink resource show pci/:03:00.0
> >pci/:03:00.0:
> >  name kvd size 245760 size_valid true
> >  resources:
> >name linear size 98304 occ 0
> >name hash_double size 60416
> >name hash_single size 87040
> >
> >So this 2700 has 3 resources that can be managed -- some table or
> >resource or something named 'kvd' with linear, hash_double and
> >hash_single sub-resources. What are these names referring to? The above
> >output gives no description, and 'kvd' is not an industry term. Further,
> 
> These are internal resources specific to the ASIC. Would you like some
> description to each or something like that?
> 
> 
> >what are these sizes that a user can control? The output contains no
> >units, no description, nothing. In short, the above output provides
> >random numbers associated with random names.
> 
> Units are now exposed from kernel, just this version of iproute2 patch
> does not display it.
> 
> 
> >
> >I can see dpipe tables exported by this device:
> >
> >$ devlink dpipe header show pci/:03:00.0
> >
> >pci/:03:00.0:
> >  name mlxsw_meta
> >  field:
> >name erif_port bitwidth 32 mapping_type ifindex
> >name l3_forward bitwidth 1
> >name l3_drop bitwidth 1
> >name adj_index bitwidth 32
> >name adj_size bitwidth 32
> >name adj_hash_index bitwidth 32
> >
> >  name ipv6
> >  field:
> >name destination ip bitwidth 128
> >
> >  name ipv4
> >  field:
> >name destination ip bitwidth 32
> >
> >  name ethernet
> >  field:
> >name destination mac bitwidth 48
> >
> >but none mention 'kvd' or 'linear' or 'hash' and none of the other
> >various devlink options:
> >
> >$ devlink
> >Usage: devlink [ OPTIONS ] OBJECT { COMMAND | help }
> >where  OBJECT := { dev | port | sb | monitor | dpipe }
> >
> >seem to relate to resources.
> >
> >So how does a user know what they are controlling by this 'resource'
> >option? Is the user expected to have a PRM or user guide on hand for the
> >specific device model that is being configured?
> 
> The relation of specific dpipe table to specific resource is exposed by
> the kernel as well. Probably the iproute2 patch just does not display
> it.

It does, just under 'table' and not under 'header'. E.g.:

# ./devlink/devlink dpipe -j -p table show pci/:03:00.0
...
},{
"resource_path": "/kvd/hash_single",
"name": "mlxsw_host4",
"size": 0,
"counters_enabled": "false",
"match": [{
"type": "field_exact",
"header": "mlxsw_meta",
"field": "erif_port",
"mapping": "ifindex"
},{
"type": "field_exact",
"header": "ipv4",
"field": "destination ip"
}
],
"action": [{
"type": "field_modify",
"header": "ethernet",
"field": "destination mac"
}
]
},{
...

> 
> 
> >
> >Again, I have no objections to kvd, linear, hash, etc terms as they do
> >relate to Mellanox products. But kvd/linear, for example, does correlate
> >to industry standard concepts in some way. My request is that the
> >resource listing guide the user in some way, stating what these
> >resources mean.
> 
> So the shown relation to dpipe table would be enough or you would still
> like to see some description? I don't like the description concept here
> as the relations to dpipe table should tell user exactly what he needs
> to know.
> 
> 
> >
>

RE: [PATCH iproute] qdisc: Print offload indication

2017-12-26 Thread Yuval Mintz

> >Use the newly added TCA_HW_OFFLOAD indication from kernel
> >to print a consistent 'offloaded' message to user when listing qdiscs.
> >
> >Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
> 
> Reviewed-by: Jiri Pirko <j...@mellanox.com>

Just now saw Stephen's e-mail regarding iproute's net-next branch.
In this case the new uapi needed was pushed to 'net'. Does this mean you're
going to take it to the master or net-next branch?

If the latter, it doesn't apply cleanly on David's branch since he already
updated the kernel headers in include/uapi/linux/rtnetlink.h.
It's trivial, but tell me if you want a re-spin.

[PATCH iproute] qdisc: Print offload indication

2017-12-26 Thread Yuval Mintz

Use the newly added TCA_HW_OFFLOAD indication from kernel
to print a consistent 'offloaded' message to user when listing qdiscs.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/uapi/linux/rtnetlink.h | 1 +
 tc/tc_qdisc.c  | 4 
 2 files changed, 5 insertions(+)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index b8335b5..13bf56f 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -557,6 +557,7 @@ enum {
TCA_PAD,
TCA_DUMP_INVISIBLE,
TCA_CHAIN,
+   TCA_HW_OFFLOAD,
__TCA_MAX
 };
 
diff --git a/tc/tc_qdisc.c b/tc/tc_qdisc.c
index 4431d5f..70279b9 100644
--- a/tc/tc_qdisc.c
+++ b/tc/tc_qdisc.c
@@ -266,6 +266,10 @@ int print_qdisc(const struct sockaddr_nl *who,
if (t->tcm_info != 1)
print_uint(PRINT_ANY, "refcnt", "refcnt %u ", t->tcm_info);
 
+   if (tb[TCA_HW_OFFLOAD] &&
+   (rta_getattr_u8(tb[TCA_HW_OFFLOAD])))
+   print_bool(PRINT_ANY, "offloaded", "offloaded ", true);
+
/* pfifo_fast is generic enough to warrant the hardcoding --JHS */
if (strcmp("pfifo_fast", RTA_DATA(tb[TCA_KIND])) == 0)
q = get_qdisc_kind("prio");
-- 
2.4.3

[PATCH net 1/3] net: sched: Add TCA_HW_OFFLOAD

2017-12-14 Thread Yuval Mintz

Qdiscs can be offloaded to HW, but current implementation isn't uniform.
Instead, qdiscs either pass information about offload status via their
TCA_OPTIONS or omit it altogether.

Introduce a new attribute - TCA_HW_OFFLOAD that would form a uniform
uAPI for the offloading status of qdiscs.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
Do notice this is going to create [easy-to-solve-]conflicts with net-next,
due to 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking").
That's also why the numbering here is apparently inconsistent [skipping
0x100].
---

 include/net/sch_generic.h  | 1 +
 include/uapi/linux/rtnetlink.h | 1 +
 net/sched/sch_api.c| 2 ++
 3 files changed, 4 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25..83a3e47 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -71,6 +71,7 @@ struct Qdisc {
  * qdisc_tree_decrease_qlen() should stop.
  */
 #define TCQ_F_INVISIBLE0x80 /* invisible by default in dump */
+#define TCQ_F_OFFLOADED0x200 /* qdisc is offloaded to HW */
u32 limit;
const struct Qdisc_ops  *ops;
struct qdisc_size_table __rcu *stab;
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index d8b5f80..843e29a 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -557,6 +557,7 @@ enum {
TCA_PAD,
TCA_DUMP_INVISIBLE,
TCA_CHAIN,
+   TCA_HW_OFFLOAD,
__TCA_MAX
 };
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b6c4f53..0f1eab9 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -795,6 +795,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc 
*q, u32 clid,
tcm->tcm_info = refcount_read(>refcnt);
if (nla_put_string(skb, TCA_KIND, q->ops->id))
goto nla_put_failure;
+   if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
+   goto nla_put_failure;
if (q->ops->dump && q->ops->dump(q, skb) < 0)
goto nla_put_failure;
qlen = q->q.qlen;
-- 
2.4.3

[PATCH net 3/3] pkt_sched: Remove TC_RED_OFFLOADED from uapi

2017-12-14 Thread Yuval Mintz

Following the previous patch, RED is now using the new uniform uapi
for indicating it's offloaded. As a result, TC_RED_OFFLOADED is no
longer utilized by kernel and can be removed [as it's still not
part of any stable release].

Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc")
Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 include/uapi/linux/pkt_sched.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index af3cc2f..37b5096 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -256,7 +256,6 @@ struct tc_red_qopt {
 #define TC_RED_ECN 1
 #define TC_RED_HARDDROP2
 #define TC_RED_ADAPTATIVE  4
-#define TC_RED_OFFLOADED   8
 };
 
 struct tc_red_xstats {
-- 
2.4.3

[PATCH net 2/3] net: sched: Move to new offload indication in RED

2017-12-14 Thread Yuval Mintz

Let RED utilize the new internal flag, TCQ_F_OFFLOADED,
to mark a given qdisc as offloaded instead of using a dedicated
indication.

Also, change internal logic into looking at said flag when possible.

Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc")
Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 net/sched/sch_red.c | 31 +++
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 9d874e6..f0747eb 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -157,6 +157,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
.handle = sch->handle,
.parent = sch->parent,
};
+   int err;
 
if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
return -EOPNOTSUPP;
@@ -171,7 +172,14 @@ static int red_offload(struct Qdisc *sch, bool enable)
opt.command = TC_RED_DESTROY;
}
 
-   return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, );
+   err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, );
+
+   if (!err && enable)
+   sch->flags |= TCQ_F_OFFLOADED;
+   else
+   sch->flags &= ~TCQ_F_OFFLOADED;
+
+   return err;
 }
 
 static void red_destroy(struct Qdisc *sch)
@@ -274,7 +282,7 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt)
return red_change(sch, opt);
 }
 
-static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt)
+static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
 {
struct net_device *dev = qdisc_dev(sch);
struct tc_red_qopt_offload hw_stats = {
@@ -286,21 +294,12 @@ static int red_dump_offload(struct Qdisc *sch, struct 
tc_red_qopt *opt)
.stats.qstats = >qstats,
},
};
-   int err;
 
-   opt->flags &= ~TC_RED_OFFLOADED;
-   if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
-   return 0;
-
-   err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
-   _stats);
-   if (err == -EOPNOTSUPP)
+   if (!(sch->flags & TCQ_F_OFFLOADED))
return 0;
 
-   if (!err)
-   opt->flags |= TC_RED_OFFLOADED;
-
-   return err;
+   return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+_stats);
 }
 
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -319,7 +318,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
int err;
 
sch->qstats.backlog = q->qdisc->qstats.backlog;
-   err = red_dump_offload(sch, );
+   err = red_dump_offload_stats(sch, );
if (err)
goto nla_put_failure;
 
@@ -347,7 +346,7 @@ static int red_dump_stats(struct Qdisc *sch, struct 
gnet_dump *d)
.marked = q->stats.prob_mark + q->stats.forced_mark,
};
 
-   if (tc_can_offload(dev) &&  dev->netdev_ops->ndo_setup_tc) {
+   if (sch->flags & TCQ_F_OFFLOADED) {
struct red_stats hw_stats = {0};
struct tc_red_qopt_offload hw_stats_request = {
.command = TC_RED_XSTATS,
-- 
2.4.3

[PATCH net 0/3] net: sched: Make qdisc offload uapi uniform

2017-12-14 Thread Yuval Mintz

Several qdiscs can already be offloaded to hardware, but there's an
inconsistency in regard to the uapi through which they indicate such
an offload is taking place - indication is passed to the user via
TCA_OPTIONS where each qdisc retains private logic for setting it.

The recent addition of offloading to RED in
602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc") caused
the addition of yet another uapi field for this purpose -
TC_RED_OFFLOADED.

For clarity and prevention of bloat in the uapi we want to eliminate
said added uapi, replacing it with a common mechanism that can be used
to reflect offload status of the various qdiscs.

The first patch introduces TCA_HW_OFFLOAD as the generic message meant
for this purpose. The second changes the current RED implementation into
setting the internal bits necessary for passing it, and the third removes
TC_RED_OFFLOADED as it's no longer needed.

Dave,

A bit unorthodox as it's not a fix per-se, but it's the last chance
for killing the unneeded uapi and replacing it with something better
before getting stuck with it forever.

Cheers,
Yuval

Yuval Mintz (3):
  net: sched: Add TCA_HW_OFFLOAD
  net: sched: Move to new offload indication in RED
  pkt_sched: Remove TC_RED_OFFLOADED from uapi

 include/net/sch_generic.h  |  1 +
 include/uapi/linux/pkt_sched.h |  1 -
 include/uapi/linux/rtnetlink.h |  1 +
 net/sched/sch_api.c|  2 ++
 net/sched/sch_red.c| 31 +++
 5 files changed, 19 insertions(+), 17 deletions(-)

-- 
2.4.3

RE: [PATCH net-next 1/4] net: Introduce NETIF_F_GRO_HW

2017-12-04 Thread Yuval Mintz

> Introduce NETIF_F_GRO_HW feature flag for NICs that support hardware
> GRO.  With this flag, we can now independently turn on or off hardware
> GRO when GRO is on.  Hardware GRO guarantees that packets can be
> re-segmented by TSO/GSO to reconstruct the original packet stream.
> 
> Cc: Ariel Elior 
> Cc: everest-linux...@cavium.com
> Signed-off-by: Michael Chan 
> ---
>  Documentation/networking/netdev-features.txt |  7 +++
>  include/linux/netdev_features.h  |  5 -
>  net/core/dev.c   | 13 +
>  net/core/ethtool.c   |  1 +
>  4 files changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/networking/netdev-features.txt
> b/Documentation/networking/netdev-features.txt
> index 7413eb0..d76d332 100644
> --- a/Documentation/networking/netdev-features.txt
> +++ b/Documentation/networking/netdev-features.txt
> @@ -163,3 +163,10 @@ This requests that the NIC receive all possible
> frames, including errored
>  frames (such as bad FCS, etc).  This can be helpful when sniffing a link with
>  bad packets on it.  Some NICs may receive more packets if also put into
> normal
>  PROMISC mode.
> +
> +*  rx-gro-hw
> +
> +This requests that the NIC enables Hardware GRO (generic receive offload).
> +Hardware GRO is basically the exact reverse of TSO, and is generally
> +stricter than Hardware LRO.  A packet stream merged by Hardware GRO
> must
> +be re-segmentable by GSO or TSO back to the exact packet stream.
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index dc8b489..d18ef6f 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -77,6 +77,8 @@ enum {
>   NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum
> offload */
>   NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP
> tunnels */
> 
> + NETIF_F_GRO_HW_BIT, /* Hardware Generic receive
> offload */
> +
>   /*
>* Add your fresh new feature above and remember to update
>* netdev_features_strings[] in net/core/ethtool.c and maybe
> @@ -96,6 +98,7 @@ enum {
>  #define NETIF_F_FRAGLIST __NETIF_F(FRAGLIST)
>  #define NETIF_F_FSO  __NETIF_F(FSO)
>  #define NETIF_F_GRO  __NETIF_F(GRO)
> +#define NETIF_F_GRO_HW   __NETIF_F(GRO_HW)
>  #define NETIF_F_GSO  __NETIF_F(GSO)
>  #define NETIF_F_GSO_ROBUST   __NETIF_F(GSO_ROBUST)
>  #define NETIF_F_HIGHDMA  __NETIF_F(HIGHDMA)
> @@ -193,7 +196,7 @@ enum {
>   * If upper/master device has these features disabled, they must be disabled
>   * on all lower/slave devices as well.
>   */
> -#define NETIF_F_UPPER_DISABLES   NETIF_F_LRO
> +#define NETIF_F_UPPER_DISABLES   (NETIF_F_LRO | NETIF_F_GRO_HW)
> 
>  /* changeable features with no special hardware requirements */
>  #define NETIF_F_SOFT_FEATURES(NETIF_F_GSO | NETIF_F_GRO)
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 30b5fe3..09c2ad0 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -7392,6 +7392,19 @@ static netdev_features_t
> netdev_fix_features(struct net_device *dev,
>   features &= ~dev->gso_partial_features;
>   }
> 
> + if (features & NETIF_F_GRO_HW) {
> + /* Hardware GRO depends on GRO. */
> + if (!(features & NETIF_F_GRO)) {

While at it, perhaps also make it dependent on NETIF_F_RXCSUM?

> + netdev_dbg(dev, "Dropping NETIF_F_GSO_HW since
> no GRO feature.\n");
> + features &= ~NETIF_F_GRO_HW;
> + }
> + /* Hardware GRO and LRO are mutually exclusive. */
> + if (features & NETIF_F_LRO) {
> + netdev_dbg(dev, "Dropping NETIF_F_LRO since
> GRO_HW is set.\n");
> + features &= ~NETIF_F_LRO;

Isn't this considered to be breaking an existing API?
After this, while NETIF_F_GRO_HW is published an application trying to
set NETIF_F_LRO and then query its state would discover it failed
[while previously it could have succeeded, such as for bnx2]
 
While I understand the need to make sure core doesn't enable
two competing aggregation offloads, why make GRO_HW > LRO?
I understand it's probably the better one, but until LRO gets deprecated
isn't it safer to do this limitation the opposite way?
I.e., make sure NETIF_F_GRO_HW can't be set as long as NETIF_F_LRO is set?

> + }
> + }
> +
>   return features;
>  }
> 
> diff --git a/net/core/ethtool.c b/net/core/ethtool.c
> index f8fcf45..50a7920 100644
> --- a/net/core/ethtool.c
> +++ b/net/core/ethtool.c
> @@ -73,6 +73,7 @@ int ethtool_op_get_ts_info(struct net_device *dev,
> struct ethtool_ts_info *info)
>   [NETIF_F_LLTX_BIT] = "tx-lockless",
>   [NETIF_F_NETNS_LOCAL_BIT] =  "netns-local",
>   [NETIF_F_GRO_BIT] =  "rx-gro",
> + [NETIF_F_GRO_HW_BIT] =

RE: [PATCH net-next 4/4] qede: Use NETIF_F_GRO_HW.

2017-12-04 Thread Yuval Mintz

> Advertise NETIF_F_GRO_HW and turn on or off hardware GRO based on
> NETIF_F_GRO_FW flag.
> 
> Cc: Ariel Elior 
> Cc: everest-linux...@cavium.com
> Signed-off-by: Michael Chan 
> ---
>  drivers/net/ethernet/qlogic/qede/qede_filter.c | 9 ++---
>  drivers/net/ethernet/qlogic/qede/qede_main.c   | 4 ++--
>  2 files changed, 4 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c
> b/drivers/net/ethernet/qlogic/qede/qede_filter.c
> index c1a0708..7ee49b4 100644
> --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
> +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
> @@ -901,13 +901,8 @@ int qede_set_features(struct net_device *dev,
> netdev_features_t features)
>   netdev_features_t changes = features ^ dev->features;
>   bool need_reload = false;
> 
> - /* No action needed if hardware GRO is disabled during driver load */
> - if (changes & NETIF_F_GRO) {
> - if (dev->features & NETIF_F_GRO)
> - need_reload = !edev->gro_disable;
> - else
> - need_reload = edev->gro_disable;
> - }
> + if (changes & NETIF_F_GRO_HW)
> + need_reload = true;

This doesn't look right; edev->gro_disable can change due to other
conditions as well - otherwise, it would have been synonymous with 
(dev->features & NETIF_F_GRO).

> 
>   if (need_reload) {
>   struct qede_reload_args args;
> diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c
> b/drivers/net/ethernet/qlogic/qede/qede_main.c
> index 8f9b3eb..b81620e 100644
> --- a/drivers/net/ethernet/qlogic/qede/qede_main.c
> +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
> @@ -676,7 +676,7 @@ static void qede_init_ndev(struct qede_dev *edev)
>   ndev->priv_flags |= IFF_UNICAST_FLT;
> 
>   /* user-changeble features */
> - hw_features = NETIF_F_GRO | NETIF_F_SG |
> + hw_features = NETIF_F_GRO | NETIF_F_GRO_HW | NETIF_F_SG |
> NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
> NETIF_F_TSO | NETIF_F_TSO6;
> 
> @@ -1515,7 +1515,7 @@ static void qede_init_fp(struct qede_dev *edev)
>edev->ndev->name, queue_id);
>   }
> 
> - edev->gro_disable = !(edev->ndev->features & NETIF_F_GRO);
> + edev->gro_disable = !(edev->ndev->features & NETIF_F_GRO_HW);
>  }
> 
>  static int qede_set_real_num_queues(struct qede_dev *edev)
> --
> 1.8.3.1

[PATCH net 2/2] MAINTAINERS: Remove Yotam from mlxfw

2017-10-30 Thread Yuval Mintz

Provide a mailing list for maintenance of the module instead.

Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index beac4ee..e2dc868 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8750,7 +8750,7 @@ Q:http://patchwork.ozlabs.org/project/netdev/list/
 F: drivers/net/ethernet/mellanox/mlxsw/
 
 MELLANOX FIRMWARE FLASH LIBRARY (mlxfw)
-M: Yotam Gigi <yot...@mellanox.com>
+M: ml...@mellanox.com
 L: netdev@vger.kernel.org
 S: Supported
 W: http://www.mellanox.com
-- 
2.4.3

[PATCH net 0/2] MAINTAINERS: Update Yotam Gigi details

2017-10-30 Thread Yuval Mintz

Yotam has left Mellanox [at least temporarily] and we want to thank
him for his effort and wish him best of luck in his future endeavors.

This patch comes to re-organize Yotam's maintainer responsibilities -

  a. Yotam is planning to continue (co-)maintaining IFE & PSAMPLE,
 so we're simply removing his corporate E-mail, replacing it with
 his private one.

  b. For mlxfw we're providing a mailing list as a replacement for his
 contact details.

Dave,

Please consider applying this to 'net'.

Thanks,
Yuval


Yotam Gigi (1):
  MAINTAINERS: Update Yotam's E-mail

Yuval Mintz (1):
  MAINTAINERS: Remove Yotam from mlxfw

 MAINTAINERS| 6 +++---
 net/ife/ife.c  | 2 +-
 net/psample/psample.c  | 2 +-
 net/sched/act_sample.c | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

-- 
2.4.3

[PATCH net 1/2] MAINTAINERS: Update Yotam's E-mail

2017-10-30 Thread Yuval Mintz

From: Yotam Gigi <yotam...@gmail.com>

For the time being I will be available in my private mail. Update both the
MAINTAINERS file and the individual modules MODULE_AUTHOR directive with
the new address.

Signed-off-by: Yotam Gigi <yotam...@gmail.com>
Signed-off-by: Yuval Mintz <yuv...@mellanox.com>
---
 MAINTAINERS| 4 ++--
 net/ife/ife.c  | 2 +-
 net/psample/psample.c  | 2 +-
 net/sched/act_sample.c | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 3e95dd7..beac4ee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6677,7 +6677,7 @@ F:include/net/ieee802154_netdev.h
 F: Documentation/networking/ieee802154.txt
 
 IFE PROTOCOL
-M: Yotam Gigi <yot...@mellanox.com>
+M: Yotam Gigi <yotam...@gmail.com>
 M: Jamal Hadi Salim <j...@mojatatu.com>
 F: net/ife
 F: include/net/ife.h
@@ -10897,7 +10897,7 @@ S:  Maintained
 F: drivers/block/ps3vram.c
 
 PSAMPLE PACKET SAMPLING SUPPORT:
-M: Yotam Gigi <yot...@mellanox.com>
+M: Yotam Gigi <yotam...@gmail.com>
 S: Maintained
 F: net/psample
 F: include/net/psample.h
diff --git a/net/ife/ife.c b/net/ife/ife.c
index f360341..7d1ec76 100644
--- a/net/ife/ife.c
+++ b/net/ife/ife.c
@@ -137,6 +137,6 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 
dlen, const void *dval)
 EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
 
 MODULE_AUTHOR("Jamal Hadi Salim <j...@mojatatu.com>");
-MODULE_AUTHOR("Yotam Gigi <yot...@mellanox.com>");
+MODULE_AUTHOR("Yotam Gigi <yotam...@gmail.com>");
 MODULE_DESCRIPTION("Inter-FE LFB action");
 MODULE_LICENSE("GPL");
diff --git a/net/psample/psample.c b/net/psample/psample.c
index 3a6ad0f..64f9562 100644
--- a/net/psample/psample.c
+++ b/net/psample/psample.c
@@ -296,6 +296,6 @@ static void __exit psample_module_exit(void)
 module_init(psample_module_init);
 module_exit(psample_module_exit);
 
-MODULE_AUTHOR("Yotam Gigi <yot...@mellanox.com>");
+MODULE_AUTHOR("Yotam Gigi <yotam...@gmail.com>");
 MODULE_DESCRIPTION("netlink channel for packet sampling");
 MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index a9f9a2cc..8b5abcd 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -271,6 +271,6 @@ static void __exit sample_cleanup_module(void)
 module_init(sample_init_module);
 module_exit(sample_cleanup_module);
 
-MODULE_AUTHOR("Yotam Gigi <yot...@mellanox.com>");
+MODULE_AUTHOR("Yotam Gigi <yotam...@gmail.com>");
 MODULE_DESCRIPTION("Packet sampling action");
 MODULE_LICENSE("GPL v2");
-- 
2.4.3

RE: [PATCH net-next v3 06/10] bnxt: Add devlink support for config get/set

2017-10-24 Thread Yuval Mintz

> +static int bnxt_nvm_read(struct bnxt *bp, int nvm_param, int idx,
> +  void *buf, int size)
> +{
> + struct hwrm_nvm_get_variable_input req = {0};
> + dma_addr_t dest_data_dma_addr;
> + void *dest_data_addr = NULL;
> + int bytesize;
> + int rc;
> +
> + bytesize = (size + 7) / BITS_PER_BYTE;
roundup?

..

+static int bnxt_nvm_write(struct bnxt *bp, int nvm_param, int idx,
> +   const void *buf, int size)
> +{
> + struct hwrm_nvm_set_variable_input req = {0};
> + dma_addr_t src_data_dma_addr;
> + void *src_data_addr = NULL;
> + int bytesize;
> + int rc;
> +
> + bytesize = (size + 7) / BITS_PER_BYTE;
Likewise

> +
> + src_data_addr = dma_alloc_coherent(&bp->pdev->dev, bytesize,
> +&src_data_dma_addr,
> GFP_KERNEL);
> + if (!src_data_addr) {
> + netdev_err(bp->dev, "dma_alloc_coherent failure\n");

Won't you see an oom? Why do you need the print?

> +static int bnxt_dl_perm_config_set(struct devlink *devlink,
> +enum devlink_perm_config_param param,
> +u8 type, void *value, u8 *restart_reqd)
> +{
> + struct bnxt *bp = bnxt_get_bp_from_dl(devlink);
> + struct bnxt_drv_cfgparam *entry;
> + int idx = 0;
> + int ret = 0;
> + u32 bytesize;
> + u32 val32;
> + u16 val16;
> + u8 val8;
> + int i;
> +
> + *restart_reqd = 0;
> +
> + /* Find parameter in table */
> + for (i = 0; i < BNXT_NUM_DRV_CFGPARAM; i++) {
> + if (param == bnxt_drv_cfgparam_list[i].param) {
> + entry = &bnxt_drv_cfgparam_list[i];
> + break;
> + }
> + }
> +
> + /* Not found */
> + if (i == BNXT_NUM_DRV_CFGPARAM)
> + return -EINVAL;
> +
Looks cleaner to check whether entry is set instead
...

> + bytesize = (entry->bitlength + 7) / BITS_PER_BYTE;

Roundup?

...

> + if (bytesize == 1) {
> + val8 = val32;

Don't you need explicit casts for these kinds of assignments
to prevent warnings?

> + ret = bnxt_nvm_write(bp, entry->nvm_param, idx,
> &val8,
> +  entry->bitlength);
> + } else if (bytesize == 2) {
> + val16 = val32;
> + ret = bnxt_nvm_write(bp, entry->nvm_param, idx,
> &val16,
> +  entry->bitlength);
> + } else {
> + ret = bnxt_nvm_write(bp, entry->nvm_param, idx,
> &val32,
> +  entry->bitlength);
> + }
> + }
> +
> + /* Restart required for all nvm parameter writes */
> + *restart_reqd = 1;
> +
> + return ret;
> +}
> +
> +static int bnxt_dl_perm_config_get(struct devlink *devlink,
> +enum devlink_perm_config_param param,
> +u8 type, void *value)
> +{
Same comments as for the setter
...

> - if (!pci_find_ext_capability(bp->pdev, PCI_EXT_CAP_ID_SRIOV))
> - return 0;
> -
> - if (bp->hwrm_spec_code < 0x10800) {
> + if ((!pci_find_ext_capability(bp->pdev, PCI_EXT_CAP_ID_SRIOV)) ||
> + bp->hwrm_spec_code < 0x10800) {
> + /* eswitch switchdev mode not supported */
> + bnxt_dl_ops.eswitch_mode_set = NULL;
> + bnxt_dl_ops.eswitch_mode_get = NULL;

Why would you need to tie this interface to the presence of SRIOV in PCIe?
Also, assuming the ability to disable sriov in #2 would cause this capability
not to be exposed after reboot, isn't this a one-way ticket?

RE: [PATCH net-next v3 01/10] devlink: Add permanent config parameter get/set operations

2017-10-24 Thread Yuval Mintz

> On Tue, Oct 24, 2017 at 5:22 PM, Yuval Mintz <yuv...@mellanox.com>
> wrote:
> >> Add support for permanent config parameter get/set commands. Used
> >> for persistent device configuration parameters.
> >>
> > ...
> >> + int (*perm_config_get)(struct devlink *devlink,
> >> +enum devlink_perm_config_param param, u8
> >> type,
> >> +void *value);
> >> + int (*perm_config_set)(struct devlink *devlink,
> >> +enum devlink_perm_config_param param, u8
> >> type,
> >> +void *value, u8 *restart_reqd);
> >>  };
> >> +static int devlink_nl_single_param_get(struct sk_buff *msg,
> >> +struct devlink *devlink,
> >> +u32 param, u8 type)
> >> +{
> >> + const struct devlink_ops *ops = devlink->ops;
> >> + struct nlattr *param_attr;
> >> + void *value;
> >> + u32 val;
> >> + int err;
> >> +
> >> + /* Allocate buffer for parameter value */
> >> + switch (type) {
> >> + case NLA_U8:
> >> + value = kmalloc(sizeof(u8), GFP_KERNEL);
> >> + break;
> >> + case NLA_U16:
> >> + value = kmalloc(sizeof(u16), GFP_KERNEL);
> >> + break;
> >> + case NLA_U32:
> >> + value = kmalloc(sizeof(u32), GFP_KERNEL);
> >> + break;
> >> + default:
> >> + return -EINVAL; /* Unsupported Type */
> >> + }
> >> +
> >> + if (!value)
> >> + return -ENOMEM;
> >> +
> >> + err = ops->perm_config_get(devlink, param, type, value);
> >> + if (err)
> >> + return err;
> >
> > I suspect this logic might be risky - its dependent on the driver to cast 
> > the
> > 'value' into the proper type or else, E.g., the following switch might break
> > for BE platforms.
> > Is there any reason to have the devlink <-> driver API be based on void*
> > and not on some typed data [of sufficient size]?
> > ...
> >> + switch (type) {
> >> + case NLA_U8:
> >> + val = *((u8 *)value);
> >> + if (nla_put_u8(msg, DEVLINK_ATTR_PERM_CONFIG_VALUE,
> >> val))
> >> + goto nest_err;
> >> + break;
> >> + case NLA_U16:
> >> + val = *((u16 *)value);
> >> + if (nla_put_u16(msg,
> >> DEVLINK_ATTR_PERM_CONFIG_VALUE, val))
> >> + goto nest_err;
> >> + break;
> >> + case NLA_U32:
> >> + val = *((u32 *)value);
> >> + if (nla_put_u32(msg,
> >> DEVLINK_ATTR_PERM_CONFIG_VALUE, val))
> >> + goto nest_err;
> >> + break;
> >> + }
> 
> Why might this break on a BE system?  It's not as though driver will
> be compiled LE and kernel BE or vice versa - as long as driver and
> kernel are same endian-ness, I would think it should be okay?

It depends on the driver implementation to cast your pointer to the right type.
E.g., driver needs to fill in a u8 data in *value for a given parameter.
If the driver cast the pointer to (u8*) everything is fine. But if it cast
it to (u32*) [naïve implementation that doesn't care about the type]
and filled it, then on a LE machine value[0] would contain the data while on
BE value[3] would contain it.


> 
> In general, the issue is that the parameter could be any of the
> netlink types (per Jiri's suggestion to the previous version of this
> patch).  So, we allocate some space, tell the driver the type we're
> expecting (the type argument to the perm_config_get() function), and
> yes, we rely on the driver to write something of the type we request
> to the pointer we provide.  Are you suggesting defining a union of
> U8/U16/U32, and passing a pointer to that for the driver to fill in?

Problem is that the driver-side could always use the biggest data-type
as long as it's working on a LE machine, but that approach would break
if same driver would be tried on a BE machine.
And the developer would have no way of knowing other than via code-review.

> The issue is that whatever the types we support now, we want future
> parameters to be able to be of arbitrary types.  Defining the
> interface to use the void pointer means that some future parameter can
> be of some other type, without having to update all the drivers using
> this API...
> 
> Or did I misunderstand your suggestion?

RE: [PATCH net-next v3 03/10] devlink: Adding num VFs per PF permanent config param

2017-10-24 Thread Yuval Mintz

> Adding DEVLINK_PERM_CONFIG_NUM_VF_PER_PF permanent config
> parameter.  Value is permanent, so becomes the new default
> value for this device.
> 
> The value sets the number of VFs per PF in SR-IOV mode.

Assuming it's meant to directly control the PCIe capability value
I think you should mention it explicitly in the commit message.

RE: [PATCH net-next v3 02/10] devlink: Adding SR-IOV enablement perm config param

2017-10-24 Thread Yuval Mintz

> Adding DEVLINK_PERM_CONFIG_SRIOV_ENABLED permanent config
> parameter.  Value is permanent, so becomes the new default
> value for this device.
> 
>   0 = Disable SR-IOV
>   1 = Enable SR-IOV

Does this impose a requirement on the PCIe specifics, e.g., that the device
should no longer expose the SR-IOV PCIe capability?
Or is any implementation that would prevent a user from activating
SR-IOV sufficient?

RE: [PATCH net-next v3 01/10] devlink: Add permanent config parameter get/set operations

2017-10-24 Thread Yuval Mintz

> Add support for permanent config parameter get/set commands. Used
> for persistent device configuration parameters.
> 
...
> + int (*perm_config_get)(struct devlink *devlink,
> +enum devlink_perm_config_param param, u8
> type,
> +void *value);
> + int (*perm_config_set)(struct devlink *devlink,
> +enum devlink_perm_config_param param, u8
> type,
> +void *value, u8 *restart_reqd);
>  };
> +static int devlink_nl_single_param_get(struct sk_buff *msg,
> +struct devlink *devlink,
> +u32 param, u8 type)
> +{
> + const struct devlink_ops *ops = devlink->ops;
> + struct nlattr *param_attr;
> + void *value;
> + u32 val;
> + int err;
> +
> + /* Allocate buffer for parameter value */
> + switch (type) {
> + case NLA_U8:
> + value = kmalloc(sizeof(u8), GFP_KERNEL);
> + break;
> + case NLA_U16:
> + value = kmalloc(sizeof(u16), GFP_KERNEL);
> + break;
> + case NLA_U32:
> + value = kmalloc(sizeof(u32), GFP_KERNEL);
> + break;
> + default:
> + return -EINVAL; /* Unsupported Type */
> + }
> +
> + if (!value)
> + return -ENOMEM;
> +
> + err = ops->perm_config_get(devlink, param, type, value);
> + if (err)
> + return err;

I suspect this logic might be risky - it's dependent on the driver to cast the
'value' into the proper type; otherwise, e.g., the following switch might break
for BE platforms.
Is there any reason to have the devlink <-> driver API be based on void*
and not on some typed data [of sufficient size]?
...
> + switch (type) {
> + case NLA_U8:
> + val = *((u8 *)value);
> + if (nla_put_u8(msg, DEVLINK_ATTR_PERM_CONFIG_VALUE,
> val))
> + goto nest_err;
> + break;
> + case NLA_U16:
> + val = *((u16 *)value);
> + if (nla_put_u16(msg,
> DEVLINK_ATTR_PERM_CONFIG_VALUE, val))
> + goto nest_err;
> + break;
> + case NLA_U32:
> + val = *((u32 *)value);
> + if (nla_put_u32(msg,
> DEVLINK_ATTR_PERM_CONFIG_VALUE, val))
> + goto nest_err;
> + break;
> + }

...
> +static int devlink_nl_single_param_set(struct sk_buff *msg,
> +struct devlink *devlink,
> +u32 param, u8 type, void *value)
> +{
> + const struct devlink_ops *ops = devlink->ops;
> + struct nlattr *cfgparam_attr;
> + u8 need_restart;
> + int err;
> +
> + /* Now set parameter */
> + err = ops->perm_config_set(devlink, param, type, value,
> &need_restart);
> + if (err)
> + return err;

Likewise

RE: [PATCH net-next v2 5/6] devlink: Adding num MSI-X vectors per VF NVRAM config param

2017-10-23 Thread Yuval Mintz

> >> >> On Fri, Oct 20, 2017 at 10:10 AM, Jiri Pirko  wrote:
> >> >> > Fri, Oct 20, 2017 at 04:03:55PM CEST, steven.l...@broadcom.com
> wrote:
> >> >> >>On Thu, Oct 19, 2017 at 5:39 PM, Jiri Pirko  wrote:
> >> >> >>> Thu, Oct 19, 2017 at 10:32:21PM CEST, yuv...@mellanox.com
> wrote:
> >> >> > Adding DEVLINK_PERM_CONFIG_MSIX_VECTORS_PER_VF
> >> >> permanent
> >> >> > config
> >> >> > parameter.  Defines number of MSI-X vectors allocated per VF.
> >> >> > Value is permanent (stored in NVRAM), so becomes the new
> >> default
> >> >> > value for this device.
> >> >> 
> >> >> Sounds like you're having this enforce the same configuration for
> all
> >> >> child VFs.
> >> >> >>>
> >> >> >>> Yeah, this sounds like per-port config.
> >> >> >>>
> >> >> >>
> >> >> >>Well, it gets a little tricky here.  I assume some cards handle this
> >> >> >>per-port.  Other cards might handle this per PF, where PF may not
> >> >> >>always correspond 1:1 with a port.  And some cards maybe just
> allow a
> >> >> >>single value for this parameter for the entire card, covering all
> >> >> >>ports/PFs.
> >> >> >>
> >> >> >>To keep things simple and as general as possible, it made sense to
> set
> >> >> >>all parameters on a per-PCI device level.  As I mentioned in my
> >> >> >>cover-letter, the devices most likely to use these proposed
> commands
> >> >> >>do not have a single "whole asic" PCI b/d/f with internal mechanism
> >> >> >>for accessing ports - most expose each port (and each function on
> each
> >> >> >>port) as a separate PCI b/d/f, with no separate "whole asic" PCI
> >> >> >>b/d/f.  That's how the BCM cards work, and I think that's how the
> >> MLNX
> >> >> >>cards work, and others that would be likely to use these cmds.
> >> >> >>
> >> >> >>So, to summarize, you direct the command to the PCI b/d/f you
> want
> >> to
> >> >> >>target.  Does this make sense?
> >> >> >
> >> >> > So you plan to have 1 devlink instance for each vf? Not sure that
> >> >> > does sound right to me :/
> >> >> >
> >> >>
> >> >> For the commands proposed in this patchset, AFAIK they all apply on a
> >> >> per-PF or broader, i.e. per-port or whole-card, granularity, since
> >> >> they affect permanent config that applies at boot-up.  So, no, the VFs
> >> >> don't really come into play here.
> >> >
> >> > Regardless of whether you're planning on having VFs as devlink
> instances,
> >> > the actual attribute question remains -
> >> > you're proposing an attribute that forces all VFs to have the same value.
> >> > This probably suits your PCI core limitations but other vendors might
> have
> >> > a different capability set, and accepting this design limitation now 
> >> > would
> >> > muck all future extension attempts of such attributes.
> >> >
> >> > I think VF configurations should be planned in advance for supporting a
> >> > per-VF Configuration whenever it's possible - even if not required
> >> [/possible]
> >> > by the one pushing the new attribute.
> >> >
> >>
> >> The commands being added in this patch are for permanent (i.e. NVRAM)
> >> config - essentially setting the new default values for various
> >> features of the device at boot-up.  At that initialization time, no
> >> VFs are yet instantiated.
> >>
> >> So my perspective was, in general (not just for our specific device /
> >> design), it doesn't seem like permanent config parameters would be set
> >> on individual VFs.  That was what my previous comment was trying to
> >> convey.
> >
> > That's an odd assumption; Why should you assume there's some device
> > that allows configuring persistent behavior for all VFs but think no other
> > would set the same on a per-VF basis?
> >
> >> If that assumption is wrong, though, and there is some device that has
> >> NVRAM config that is set per-VF, I assume the user would instantiate
> >> the VF and then call the devlink API on the pci device corresponding
> >> to the VF they with to affect, and I think the model proposed still
> >> works.
> >
> > What would be the purpose of re-configuring a value reflected in the
> > PCI device for an already instantiated VF?
> >
> >> Are you suggesting adding a mechanism to set NVRAM parameters on a
> >> per-VF basis, without instantiating the VF first?  I would prefer not
> >> adding such a mechanism unless/until there's a use case for it.
> >
> > The thing is that you're suggesting a new UAPI; We don't have the leisure
> > of pushing a partial implementation and changing it later on.
> 
> I hope we're not talking past each other because I'm not sure we're
> saying the same thing.  But if you have a device which has NVRAM
> config on an individual VF basis, and you want to be able to get/set
> that configuration without instantiating the VF first (i.e. without a
> PCI device to operate on), then one way to handle this is with a new
> attribute, DEVLINK_ATTR_PERM_CONFIG_VF_INDEX, for example.
> 
> It could be sent in the nested

RE: [PATCH net-next v2 5/6] devlink: Adding num MSI-X vectors per VF NVRAM config param

2017-10-23 Thread Yuval Mintz

> >> On Fri, Oct 20, 2017 at 10:10 AM, Jiri Pirko  wrote:
> >> > Fri, Oct 20, 2017 at 04:03:55PM CEST, steven.l...@broadcom.com wrote:
> >> >>On Thu, Oct 19, 2017 at 5:39 PM, Jiri Pirko  wrote:
> >> >>> Thu, Oct 19, 2017 at 10:32:21PM CEST, yuv...@mellanox.com wrote:
> >> > Adding DEVLINK_PERM_CONFIG_MSIX_VECTORS_PER_VF
> >> permanent
> >> > config
> >> > parameter.  Defines number of MSI-X vectors allocated per VF.
> >> > Value is permanent (stored in NVRAM), so becomes the new
> default
> >> > value for this device.
> >> 
> >> Sounds like you're having this enforce the same configuration for all
> >> child VFs.
> >> >>>
> >> >>> Yeah, this sounds like per-port config.
> >> >>>
> >> >>
> >> >>Well, it gets a little tricky here.  I assume some cards handle this
> >> >>per-port.  Other cards might handle this per PF, where PF may not
> >> >>always correspond 1:1 with a port.  And some cards maybe just allow a
> >> >>single value for this parameter for the entire card, covering all
> >> >>ports/PFs.
> >> >>
> >> >>To keep things simple and as general as possible, it made sense to set
> >> >>all parameters on a per-PCI device level.  As I mentioned in my
> >> >>cover-letter, the devices most likely to use these proposed commands
> >> >>do not have a single "whole asic" PCI b/d/f with internal mechanism
> >> >>for accessing ports - most expose each port (and each function on each
> >> >>port) as a separate PCI b/d/f, with no separate "whole asic" PCI
> >> >>b/d/f.  That's how the BCM cards work, and I think that's how the
> MLNX
> >> >>cards work, and others that would be likely to use these cmds.
> >> >>
> >> >>So, to summarize, you direct the command to the PCI b/d/f you want
> to
> >> >>target.  Does this make sense?
> >> >
> >> > So you plan to have 1 devlink instance for each vf? Not sure that
> >> > does sound right to me :/
> >> >
> >>
> >> For the commands proposed in this patchset, AFAIK they all apply on a
> >> per-PF or broader, i.e. per-port or whole-card, granularity, since
> >> they affect permanent config that applies at boot-up.  So, no, the VFs
> >> don't really come into play here.
> >
> > Regardless of whether you're planning on having VFs as devlink instances,
> > the actual attribute question remains -
> > you're proposing an attribute that forces all VFs to have the same value.
> > This probably suits your PCI core limitations but other vendors might have
> > a different capability set, and accepting this design limitation now would
> > muck all future extension attempts of such attributes.
> >
> > I think VF configurations should be planned in advance for supporting a
> > per-VF Configuration whenever it's possible - even if not required
> [/possible]
> > by the one pushing the new attribute.
> >
> 
> The commands being added in this patch are for permanent (i.e. NVRAM)
> config - essentially setting the new default values for various
> features of the device at boot-up.  At that initialization time, no
> VFs are yet instantiated.
> 
> So my perspective was, in general (not just for our specific device /
> design), it doesn't seem like permanent config parameters would be set
> on individual VFs.  That was what my previous comment was trying to
> convey.

That's an odd assumption; Why should you assume there's some device
that allows configuring persistent behavior for all VFs but think no other
would set the same on a per-VF basis?

> If that assumption is wrong, though, and there is some device that has
> NVRAM config that is set per-VF, I assume the user would instantiate
> the VF and then call the devlink API on the pci device corresponding
> to the VF they with to affect, and I think the model proposed still
> works.

What would be the purpose of re-configuring a value reflected in the
PCI device for an already instantiated VF?

> Are you suggesting adding a mechanism to set NVRAM parameters on a
> per-VF basis, without instantiating the VF first?  I would prefer not
> adding such a mechanism unless/until there's a use case for it.

The thing is that you're suggesting a new UAPI; We don't have the luxury
of pushing a partial implementation and changing it later on.

RE: [PATCH net-next v2 1/6] devlink: Add permanent config parameter get/set operations

2017-10-21 Thread Yuval Mintz

> On Thu, Oct 19, 2017 at 4:21 PM, Yuval Mintz <yuv...@mellanox.com>
> wrote:
> >> Subject: [PATCH net-next v2 1/6] devlink: Add permanent config
> parameter
> >> get/set operations
> >>
> >> Add support for permanent config parameter get/set commands. Used
> >> for parameters held in NVRAM, persistent device configuration.
> >
> > Given some of the attributes aren't Boolean, what about an API that
> > allows the user to learn of supported values per option?
> > Otherwise only way for configuring some of them would be trial & error.
> 
> Interesting suggestion.  There's a couple of places where this could
> be a factor.  (1) When a user wants to know what values are
> defined/available in the API, and (2) When the user wants to know what
> values are supported by a specific driver/device.
> 
> The intention for (1) is to push that into userspace.  The userspace
> devlink tool patches I am working on (not yet submitted) essentially
> mirror the config parameters and their options, with string "keywords"
> associated with each parameter and option, since it's the userspace
> app that will be parsing the command line strings and converting to
> API enums.  So the userspace app can provide the list of
> parameters/options it supports, which could be a subset of what's
> available in the API.
> 
> For (2), currently there is no mechanism other than trial/error as you
> suggest (up to driver to either return an error or else make use of
> the value specified by the user).  We could contemplate adding such a
> mechanism, but it's a little complicated as some options take a range
> (i.e. # of VFs per PF for example), and others may take one of a set
> of enumerated values (pre-boot link speed for example).
> 
> To clarify, are you suggesting some mechanism to allow a driver to
> report which parameters and options it supports (case (2))?  Or are
> you suggesting something in the kernel API to handle case (1) above?

I was thinking of (2). And I agree it would take some effort.

> 
> >
> >>
> >> Signed-off-by: Steve Lin <steven.l...@broadcom.com>
> >> Acked-by: Andy Gospodarek <go...@broadcom.com>
> >> ---
> >>  include/net/devlink.h|   3 +
> >>  include/uapi/linux/devlink.h |  11 ++
> >>  net/core/devlink.c   | 234
> >> +++
> >>  3 files changed, 248 insertions(+)
> >>
> >> diff --git a/include/net/devlink.h b/include/net/devlink.h
> >> index b9654e1..bd64623 100644
> >> --- a/include/net/devlink.h
> >> +++ b/include/net/devlink.h
> >> @@ -270,6 +270,9 @@ struct devlink_ops {
> >>   int (*eswitch_inline_mode_set)(struct devlink *devlink, u8
> >> inline_mode);
> >>   int (*eswitch_encap_mode_get)(struct devlink *devlink, u8
> >> *p_encap_mode);
> >>   int (*eswitch_encap_mode_set)(struct devlink *devlink, u8
> >> encap_mode);
> >> + int (*perm_config_get)(struct devlink *devlink, u32 param, u32
> >> *value);
> >> + int (*perm_config_set)(struct devlink *devlink, u32 param, u32
> >> value,
> >> +u8 *restart_reqd);
> >>  };
> >>
> >>  static inline void *devlink_priv(struct devlink *devlink)
> >> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
> >> index 0cbca96..47cc584 100644
> >> --- a/include/uapi/linux/devlink.h
> >> +++ b/include/uapi/linux/devlink.h
> >> @@ -70,6 +70,10 @@ enum devlink_command {
> >>   DEVLINK_CMD_DPIPE_HEADERS_GET,
> >>   DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
> >>
> >> + /* Permanent (NVRAM) device config get/set */
> >> + DEVLINK_CMD_PERM_CONFIG_GET,
> >> + DEVLINK_CMD_PERM_CONFIG_SET,
> >> +
> >>   /* add new commands above here */
> >>   __DEVLINK_CMD_MAX,
> >>   DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
> >> @@ -202,6 +206,13 @@ enum devlink_attr {
> >>
> >>   DEVLINK_ATTR_ESWITCH_ENCAP_MODE,/* u8 */
> >>
> >> + /* Permanent Configuration Parameters */
> >> + DEVLINK_ATTR_PERM_CONFIGS,  /* nested */
> >> + DEVLINK_ATTR_PERM_CONFIG,   /* nested */
> >> + DEVLINK_ATTR_PERM_CONFIG_PARAMETER, /* u32 */
> >> + DEVLINK_ATTR_PERM_CONFIG_VALUE, /*
> >> u32 */
> >> + DEVLINK_ATTR_PERM_CONFIG_RESTART_REQUIRED,  /* u8 */
> >> +
> >>   /* a

RE: [PATCH net-next v2 5/6] devlink: Adding num MSI-X vectors per VF NVRAM config param

2017-10-21 Thread Yuval Mintz

> On Fri, Oct 20, 2017 at 10:10 AM, Jiri Pirko  wrote:
> > Fri, Oct 20, 2017 at 04:03:55PM CEST, steven.l...@broadcom.com wrote:
> >>On Thu, Oct 19, 2017 at 5:39 PM, Jiri Pirko  wrote:
> >>> Thu, Oct 19, 2017 at 10:32:21PM CEST, yuv...@mellanox.com wrote:
> > Adding DEVLINK_PERM_CONFIG_MSIX_VECTORS_PER_VF
> permanent
> > config
> > parameter.  Defines number of MSI-X vectors allocated per VF.
> > Value is permanent (stored in NVRAM), so becomes the new default
> > value for this device.
> 
> Sounds like you're having this enforce the same configuration for all
> child VFs.
> >>>
> >>> Yeah, this sounds like per-port config.
> >>>
> >>
> >>Well, it gets a little tricky here.  I assume some cards handle this
> >>per-port.  Other cards might handle this per PF, where PF may not
> >>always correspond 1:1 with a port.  And some cards maybe just allow a
> >>single value for this parameter for the entire card, covering all
> >>ports/PFs.
> >>
> >>To keep things simple and as general as possible, it made sense to set
> >>all parameters on a per-PCI device level.  As I mentioned in my
> >>cover-letter, the devices most likely to use these proposed commands
> >>do not have a single "whole asic" PCI b/d/f with internal mechanism
> >>for accessing ports - most expose each port (and each function on each
> >>port) as a separate PCI b/d/f, with no separate "whole asic" PCI
> >>b/d/f.  That's how the BCM cards work, and I think that's how the MLNX
> >>cards work, and others that would be likely to use these cmds.
> >>
> >>So, to summarize, you direct the command to the PCI b/d/f you want to
> >>target.  Does this make sense?
> >
> > So you plan to have 1 devlink instance for each vf? Not sure that
> > does sound right to me :/
> >
> 
> For the commands proposed in this patchset, AFAIK they all apply on a
> per-PF or broader, i.e. per-port or whole-card, granularity, since
> they affect permanent config that applies at boot-up.  So, no, the VFs
> don't really come into play here.

Regardless of whether you're planning on having VFs as devlink instances,
the actual attribute question remains -
you're proposing an attribute that forces all VFs to have the same value.
This probably suits your PCI core limitations but other vendors might have
a different capability set, and accepting this design limitation now would
muck all future extension attempts of such attributes.

I think VF configurations should be planned in advance for supporting a
per-VF Configuration whenever it's possible - even if not required [/possible]
by the one pushing the new attribute.

RE: [PATCH net-next v2 5/6] devlink: Adding num MSI-X vectors per VF NVRAM config param

2017-10-19 Thread Yuval Mintz

> Adding DEVLINK_PERM_CONFIG_MSIX_VECTORS_PER_VF permanent
> config
> parameter.  Defines number of MSI-X vectors allocated per VF.
> Value is permanent (stored in NVRAM), so becomes the new default
> value for this device.

Sounds like you're having this enforce the same configuration for all child VFs.

> 
> Signed-off-by: Steve Lin 
> Acked-by: Andy Gospodarek 
> ---
>  include/uapi/linux/devlink.h | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
> index 8ad6c63..ef163b6 100644
> --- a/include/uapi/linux/devlink.h
> +++ b/include/uapi/linux/devlink.h
> @@ -260,6 +260,7 @@ enum devlink_perm_config_param {
>   DEVLINK_PERM_CONFIG_SRIOV_ENABLED,
>   DEVLINK_PERM_CONFIG_NUM_VF_PER_PF,
>   DEVLINK_PERM_CONFIG_MAX_NUM_PF_MSIX_VECT,
> + DEVLINK_PERM_CONFIG_MSIX_VECTORS_PER_VF,
>  };
> 
>  #endif /* _UAPI_LINUX_DEVLINK_H_ */
> --
> 2.7.4

RE: [PATCH net-next v2 1/6] devlink: Add permanent config parameter get/set operations

2017-10-19 Thread Yuval Mintz

> Subject: [PATCH net-next v2 1/6] devlink: Add permanent config parameter
> get/set operations
> 
> Add support for permanent config parameter get/set commands. Used
> for parameters held in NVRAM, persistent device configuration.

Given some of the attributes aren't Boolean, what about an API that
allows the user to learn of supported values per option?
Otherwise only way for configuring some of them would be trial & error.

> 
> Signed-off-by: Steve Lin 
> Acked-by: Andy Gospodarek 
> ---
>  include/net/devlink.h|   3 +
>  include/uapi/linux/devlink.h |  11 ++
>  net/core/devlink.c   | 234
> +++
>  3 files changed, 248 insertions(+)
> 
> diff --git a/include/net/devlink.h b/include/net/devlink.h
> index b9654e1..bd64623 100644
> --- a/include/net/devlink.h
> +++ b/include/net/devlink.h
> @@ -270,6 +270,9 @@ struct devlink_ops {
>   int (*eswitch_inline_mode_set)(struct devlink *devlink, u8
> inline_mode);
>   int (*eswitch_encap_mode_get)(struct devlink *devlink, u8
> *p_encap_mode);
>   int (*eswitch_encap_mode_set)(struct devlink *devlink, u8
> encap_mode);
> + int (*perm_config_get)(struct devlink *devlink, u32 param, u32
> *value);
> + int (*perm_config_set)(struct devlink *devlink, u32 param, u32
> value,
> +u8 *restart_reqd);
>  };
> 
>  static inline void *devlink_priv(struct devlink *devlink)
> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
> index 0cbca96..47cc584 100644
> --- a/include/uapi/linux/devlink.h
> +++ b/include/uapi/linux/devlink.h
> @@ -70,6 +70,10 @@ enum devlink_command {
>   DEVLINK_CMD_DPIPE_HEADERS_GET,
>   DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
> 
> + /* Permanent (NVRAM) device config get/set */
> + DEVLINK_CMD_PERM_CONFIG_GET,
> + DEVLINK_CMD_PERM_CONFIG_SET,
> +
>   /* add new commands above here */
>   __DEVLINK_CMD_MAX,
>   DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
> @@ -202,6 +206,13 @@ enum devlink_attr {
> 
>   DEVLINK_ATTR_ESWITCH_ENCAP_MODE,/* u8 */
> 
> + /* Permanent Configuration Parameters */
> + DEVLINK_ATTR_PERM_CONFIGS,  /* nested */
> + DEVLINK_ATTR_PERM_CONFIG,   /* nested */
> + DEVLINK_ATTR_PERM_CONFIG_PARAMETER, /* u32 */
> + DEVLINK_ATTR_PERM_CONFIG_VALUE, /*
> u32 */
> + DEVLINK_ATTR_PERM_CONFIG_RESTART_REQUIRED,  /* u8 */
> +
>   /* add new attributes above here, update the policy in devlink.c */
> 
>   __DEVLINK_ATTR_MAX,
> diff --git a/net/core/devlink.c b/net/core/devlink.c
> index 7d430c1..c2cc7c6 100644
> --- a/net/core/devlink.c
> +++ b/net/core/devlink.c
> @@ -1566,6 +1566,224 @@ static int
> devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
>   return 0;
>  }
> 
> +static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1];
> +
> +static int devlink_nl_single_param_get(struct sk_buff *msg,
> +struct devlink *devlink,
> +uint32_t param)
> +{
> + u32 value;
> + int err;
> + const struct devlink_ops *ops = devlink->ops;
> + struct nlattr *param_attr;
> +
> + err = ops->perm_config_get(devlink, param, &value);
> + if (err)
> + return err;
> +
> + param_attr = nla_nest_start(msg, DEVLINK_ATTR_PERM_CONFIG);
> + nla_put_u32(msg, DEVLINK_ATTR_PERM_CONFIG_PARAMETER,
> param);
> + nla_put_u32(msg, DEVLINK_ATTR_PERM_CONFIG_VALUE, value);
> + nla_nest_end(msg, param_attr);
> +
> + return 0;
> +}
> +
> +static int devlink_nl_config_get_fill(struct sk_buff *msg,
> +   struct devlink *devlink,
> +   enum devlink_command cmd,
> +   struct genl_info *info)
> +{
> + void *hdr;
> + int err;
> + struct nlattr *attr;
> + int param_count = 0;
> + struct nlattr *cfgparam_attr;
> + int rem;
> + struct nlattr *tb[DEVLINK_ATTR_MAX + 1];
> + u32 param;
> +
> + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
> +   &devlink_nl_family, 0, cmd);
> + if (!hdr) {
> + err = -EMSGSIZE;
> + goto nla_msg_failure;
> + }
> +
> + err = devlink_nl_put_handle(msg, devlink);
> + if (err)
> + goto nla_put_failure;
> +
> + if (!info->attrs[DEVLINK_ATTR_PERM_CONFIGS]) {
> + /* No configuration parameters */
> + goto nla_put_failure;
> + }
> +
> + cfgparam_attr = nla_nest_start(msg,
> DEVLINK_ATTR_PERM_CONFIGS);
> +
> + nla_for_each_nested(attr, info-
> >attrs[DEVLINK_ATTR_PERM_CONFIGS],
> + rem) {
Isn't it possible that a response for a single request for multiple ATTRs
wouldn't fit in a single message?

> + err = nla_parse_nested(tb, DEVLINK_ATTR_MAX, attr,
>

RE: [PATCH 4/7] devlink: Adding perm config of link settings

2017-10-19 Thread Yuval Mintz

> On Thu, Oct 19, 2017 at 2:07 AM, Yuval Mintz <yuv...@mellanox.com>
> wrote:
> >> +enum devlink_autoneg_protocol {
> >> + DEVLINK_AUTONEG_PROTOCOL_IEEE8023BY_BAM,
> >> + DEVLINK_AUTONEG_PROTOCOL_IEEE8023BY_CONSORTIUM,
> >> + DEVLINK_AUTONEG_PROTOCOL_IEEE8023BY,
> >> + DEVLINK_AUTONEG_PROTOCOL_BAM,   /* Broadcom
> >> Autoneg Mode */
> >> + DEVLINK_AUTONEG_PROTOCOL_CONSORTIUM,/*
> >> Consortium Autoneg Mode */
> >> +};
> >
> > Wouldn't adding BAM as a 'generic' mode of operation be like adding
> > non-consortium speeds to ethtool API?
> > [I profess ignorance in this area; For all I know it can be a widely 
> > accepted
> > industry standard]
> >
> 
> Yuval, I'm glad to get input from other NIC vendors.  
Other switch vendors ;)

> The high-level goal of this effort is to allow users of various vendors'
> NICs to be able to configure these types of NVRAM/permanent/default settings
> using an inbox tool, rather than the collection of vendor-specific
> tools that is the status quo.
> 
> In order to provide that functionality, it seems like the
> vendor-specific parameters and also the vendor-specific settings of
> common parameters both need to be supported in this manner.
> 
> Ideally there will be much overlap in both the set of parameters
> available as well as the options for each parameter, but in the real
> world, there will always be differences between vendors and even
> between different devices (drivers) from the same vendor.  Despite
> that reality, I think there is still great benefit in having a common
> inbox tool that users can use for device configuration of this type.
> It just means that not all drivers will support all parameters, nor
> all options for each parameter that they do support.

I don't object to the end-goal; I think my hesitation is due to the same
enum containing both generic and vendor-specific values mixed
together. I feel like we need some clear distinction between the two.

> 
> Thanks,
> Steve

RE: [PATCH 2/7] devlink: Adding NPAR permanent config parameters

2017-10-19 Thread Yuval Mintz

> >> DEVLINK_ATTR_PERM_CFG_NPAR_BW_RESERVATION_VALID: 1 to use
> >> BW_RESERVATION setting, 0 to ignore.
> >>
> > ...
> >> DEVLINK_ATTR_PERM_CFG_NPAR_BW_LIMIT_VALID: 1 to use BW_LIMIT
> >> setting, 0 to ignore.
> >
> > While it probably ties to different fields in your NVM layout why would the
> user
> > require specific attributes for these? Why not have values in the actual
> > attributes indicating this status?
> 
> Hi Yuval,
> 
> Does having the separate valid flag present any difficulties?  There
> are lots of implementation options here (a limit or reservation value
> of 0 could mean invalid, or we could define (1 << 31) to be a valid
> flag when setting the value, etc.), and I'm not necessarily tied to
> doing it this way, but it seemed a straightforward way to represent
> the validity of the other field.

You're pushing a LOT of new attributes, every one of which is going
to have to be documented for future generations.
I think whenever it's possible to drop an unnecessary attribute, that
would be the better option.

RE: [PATCH 2/7] devlink: Adding NPAR permanent config parameters

2017-10-19 Thread Yuval Mintz

> DEVLINK_ATTR_PERM_CFG_NPAR_BW_RESERVATION_VALID: 1 to use
> BW_RESERVATION setting, 0 to ignore.
> 
...
> DEVLINK_ATTR_PERM_CFG_NPAR_BW_LIMIT_VALID: 1 to use BW_LIMIT
> setting, 0 to ignore.

While it probably ties to different fields in your NVM layout why would the user
require specific attributes for these? Why not have values in the actual
attributes indicating this status?

RE: [PATCH 3/7] devlink: Adding high level dev perm config params

2017-10-19 Thread Yuval Mintz

> DEVLINK_ATTR_PERM_CFG_MULTIFUNC_MODE: Configure multi-function
> mode; use devlink_multifunc_mode.
...
> +enum devlink_multifunc_mode {
> + DEVLINK_MULTIFUNC_MODE_ALLOWED, /* Ext switch
> activates MF */
> + DEVLINK_MULTIFUNC_MODE_FORCE_SINGFUNC,
> + DEVLINK_MULTIFUNC_MODE_NPAR10,  /* NPAR 1.0
> */
> + DEVLINK_MULTIFUNC_MODE_NPAR15,  /* NPAR 1.5
> */
> + DEVLINK_MULTIFUNC_MODE_NPAR20,  /* NPAR 2.0
> */
> +};

... And when someone would invent a new partitioning scheme, what then?
You'd extend the 'generic' modes?
It's not very generic.

> DEVLINK_ATTR_PERM_CFG_SECURE_NIC_ENABLED: 1 to enable Secure NIC
> functionality, 0 to disable.

What does it mean? Is there some spec explaining it?

RE: [PATCH 5/7] devlink: Adding pre-boot permanent config parameters

2017-10-19 Thread Yuval Mintz

> DEVLINK_ATTR_PERM_CFG_MBA_LINK_SPEED: Configured link speed
> while executing MBA host software (PXI/iSCSI); use enum
> devlink_mba_link_speed.

#4 introduces:

> DEVLINK_ATTR_PERM_CFG_PRE_OS_LINK_SPEED_D0: Configure default
> pre-OS link speed in full power (D0) state; use enum
> devlink_pre_os_link_speed.

> DEVLINK_ATTR_PERM_CFG_PRE_OS_LINK_SPEED_D3: Configure default
> pre-OS link speed in sleep (D3) state; use enum
> devlink_pre_os_link_speed.

Why would user need an additional value for the MBA speed?

RE: [PATCH 4/7] devlink: Adding perm config of link settings

2017-10-19 Thread Yuval Mintz

> +enum devlink_autoneg_protocol {
> + DEVLINK_AUTONEG_PROTOCOL_IEEE8023BY_BAM,
> + DEVLINK_AUTONEG_PROTOCOL_IEEE8023BY_CONSORTIUM,
> + DEVLINK_AUTONEG_PROTOCOL_IEEE8023BY,
> + DEVLINK_AUTONEG_PROTOCOL_BAM,   /* Broadcom
> Autoneg Mode */
> + DEVLINK_AUTONEG_PROTOCOL_CONSORTIUM,/*
> Consortium Autoneg Mode */
> +};

Wouldn't adding BAM as a 'generic' mode of operation be like adding
non-consortium speeds to ethtool API?
[I profess ignorance in this area; For all I know it can be a widely accepted
industry standard]

RE: [PATCH net-next 0/2] Add mqprio hardware offload support in hns3 driver

2017-10-16 Thread Yuval Mintz

> Hi, Yuval
> 
> On 2017/10/15 13:14, Yuval Mintz wrote:
> >> Hi, Yuval
> >>
> >> On 2017/10/13 4:21, Yuval Mintz wrote:
> >>>> This patchset adds a new hardware offload type in mqprio before
> adding
> >>>> mqprio hardware offload support in hns3 driver.
> >>>
> >>> I think one of the biggest issues in tying this to DCB configuration is 
> >>> the
> >>> non-immediate [and possibly non persistent] configuration.
> >>>
> >>> Scenario #1:
> >>> User is configuring mqprio offloaded with 3 TCs while device is in willing
> >> mode.
> >>> Would you expect the driver to immediately respond with a success or
> >> instead
> >>> delay the return until the DCBx negotiation is complete and the
> operational
> >>> num of TCs is actually 3?
> >>
> >> Well, when user requests the mqprio offloaded by a hardware shared by
> DCB,
> >> I expect
> >> the user is not using the dcb tool.
> >> If user is still using dcb tool, then result is undefined.
> >>
> >> The scenario you mention maybe can be enforced by setting willing to
> zero
> >> when user
> >> is requesting the mqprio offload, and restore the willing bit when
> unloaded
> >> the mqprio
> >> offload.
> >
> > Sounds a bit harsh but would probably work.
> >
> >> But I think the real issue is that dcb and mqprio shares the tc system in 
> >> the
> >> stack,
> >> the problem may be better to be fixed in the stack rather than in the
> driver,
> >> as you
> >> suggested in the DCB patchset. What do you think?
> >
> > What did you have in mind?
> 
> I was thinking maybe the tc system can provide a notification to mqprio and
> dcb.
> mqprio and dcb register a callback to the tc system, when there is some
> change of
> tc configuration, the tc system call the callback from mqprio and dcb.
> 
> >
> >>
> >>>
> >>> Scenario #2:
> >>> Assume user explicitly offloaded mqprio with 3 TCs, but now DCB
> >> configuration
> >>> has changed on the peer side and 4 TCs is the new negotiated
> operational
> >> value.
> >>> Your current driver logic would change the number of TCs underneath
> the
> >> user
> >>> configuration [and it would actually probably work due to mqprio being a
> >> crappy
> >>> qdisc]. But was that the user actual intention?
> >>> [I think the likely answer in this scenario is 'yes' since the 
> >>> alternative is no
> >> better.
> >>> But I still thought it was worth mentioning]
> >>
> >> You are right, the problem also have something to do with mqprio and dcb
> >> sharing
> >> the tc in the stack.
> >>
> >> During testing, when user explicitly offloaded mqprio with 3 TCs, all
> >> queue has a default pfifo mqprio attached, after DCB changes the tc num
> to
> >> 4,
> >> using tc qdisc shows some queue does not have a default pfifo mqprio
> >> attached.
> >
> > Really? Then what did it show?
> > [I assume it has some pfifo attached, and it's an mqprio dump kind of an
> issue]
> 
> When queue size of the ndev is 16 and tc num is 3, we set the real queue size
> to
> 15 ( 5 * 3 = 15), mqprio only attach pfifo to the first 15 queue, when tc num
> change
> to 4 by DCB, we set the real queue size to 16 (4 * 4 = 16).
> So tc qdisc shows the last queue has no qdisc attached.

So there is a qdisc attached - mqprio_attach() attaches to all transmission
queues [num_tx_queues] and not only the active ones.
But the flow for mqprio might be lacking the additional qdisc_hash_add()
for the additional queue's qdisc.

> 
> >
> >>
> >> Maybe we can add a callback to notify mqprio the configuration has
> changed.
> >>
> >
> > Which would do what?
> > You already have the notifications available for monitoring using dcbnl 
> > logic
> if the
> > configuration change [for user]; So user can re-configure whatever it
> wants.
> 
> Yes, if user is only using dcb tool.
> 
> > But other than dropping all the qdisc configurations and going back to the
> default
> > qdiscs, what default action would mqprio be able to do when configuration
> changes
> > that actually makes sense?
> 
> As explained above, after dcb changing the configuration, some queue may
> have no qdisc
> attached, so I was thinking maybe we can add pfifo to it if there is no qdisc
> attached
> to it.
> 
> Thanks,
> Yunsheng Lin
> 
> >
> >> Thanks
> >> Yunsheng Lin
> >>
> >>>
> >>> Cheers,
> >>> Yuval
> >>>
> >>>>
> >>>> Yunsheng Lin (2):
> >>>>   mqprio: Add a new hardware offload type in mqprio
> >>>>   net: hns3: Add mqprio hardware offload support in hns3 driver
> >>>>
> >>>>  drivers/net/ethernet/hisilicon/hns3/hnae3.h|  1 +
> >>>>  .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 23
> +++
> >>>>  .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 46
> >> ++-
> >>>> ---
> >>>>  include/uapi/linux/pkt_sched.h |  1 +
> >>>>  4 files changed, 55 insertions(+), 16 deletions(-)
> >>>>
> >>>> --
> >>>> 1.9.1
> >>>
> >>>
> >>>
> >

RE: [PATCH net-next 0/2] Add mqprio hardware offload support in hns3 driver

2017-10-15 Thread Yuval Mintz

> > >> This patchset adds a new hardware offload type in mqprio before
> adding
> > >> mqprio hardware offload support in hns3 driver.

Apparently Dave has already accepted Amirtha's changes to mqprio:
https://marc.info/?l=linux-netdev&m=150803219824053&w=2
so I guess you need to revise your patches to align to the new conventions.

> > >
> > > I think one of the biggest issues in tying this to DCB configuration is 
> > > the
> > > non-immediate [and possibly non persistent] configuration.
> > >
> > > Scenario #1:
> > > User is configuring mqprio offloaded with 3 TCs while device is in willing
> > mode.
> > > Would you expect the driver to immediately respond with a success or
> > instead
> > > delay the return until the DCBx negotiation is complete and the
> operational
> > > num of TCs is actually 3?
> >
> > Well, when user requests the mqprio offloaded by a hardware shared by
> DCB,
> > I expect
> > the user is not using the dcb tool.
> > If user is still using dcb tool, then result is undefined.
> >
> > The scenario you mention maybe can be enforced by setting willing to zero
> > when user
> > is requesting the mqprio offload, and restore the willing bit when unloaded
> > the mqprio
> > offload.
> 
> Sounds a bit harsh but would probably work.
> 
> > But I think the real issue is that dcb and mqprio shares the tc system in 
> > the
> > stack,
> > the problem may be better to be fixed in the stack rather than in the
> driver,
> > as you
> > suggested in the DCB patchset. What do you think?
> 
> What did you have in mind?
> 
> >
> > >
> > > Scenario #2:
> > > Assume user explicitly offloaded mqprio with 3 TCs, but now DCB
> > configuration
> > > has changed on the peer side and 4 TCs is the new negotiated operational
> > value.
> > > Your current driver logic would change the number of TCs underneath
> the
> > user
> > > configuration [and it would actually probably work due to mqprio being a
> > crappy
> > > qdisc]. But was that the user actual intention?
> > > [I think the likely answer in this scenario is 'yes' since the 
> > > alternative is no
> > better.
> > > But I still thought it was worth mentioning]
> >
> > You are right, the problem also have something to do with mqprio and dcb
> > sharing
> > the tc in the stack.
> >
> > During testing, when user explicitly offloaded mqprio with 3 TCs, all
> > queue has a default pfifo mqprio attached, after DCB changes the tc num
> to
> > 4,
> > using tc qdisc shows some queue does not have a default pfifo mqprio
> > attached.
> 
> Really? Then what did it show?
> [I assume it has some pfifo attached, and it's an mqprio dump kind of an
> issue]
> 
> >
> > Maybe we can add a callback to notify mqprio the configuration has
> changed.
> >
> 
> Which would do what?
> You already have the notifications available for monitoring using dcbnl logic 
> if
> the
> configuration change [for user]; So user can re-configure whatever it wants.
> But other than dropping all the qdisc configurations and going back to the
> default
> qdiscs, what default action would mqprio be able to do when configuration
> changes
> that actually makes sense?
> 
> > Thanks
> > Yunsheng Lin
> >
> > >
> > > Cheers,
> > > Yuval
> > >
> > >>
> > >> Yunsheng Lin (2):
> > >>   mqprio: Add a new hardware offload type in mqprio
> > >>   net: hns3: Add mqprio hardware offload support in hns3 driver
> > >>
> > >>  drivers/net/ethernet/hisilicon/hns3/hnae3.h|  1 +
> > >>  .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 23 +++
> > >>  .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 46
> > ++-
> > >> ---
> > >>  include/uapi/linux/pkt_sched.h |  1 +
> > >>  4 files changed, 55 insertions(+), 16 deletions(-)
> > >>
> > >> --
> > >> 1.9.1
> > >
> > >
> > >

RE: [PATCH net-next 0/2] Add mqprio hardware offload support in hns3 driver

2017-10-14 Thread Yuval Mintz

> Hi, Yuval
> 
> On 2017/10/13 4:21, Yuval Mintz wrote:
> >> This patchset adds a new hardware offload type in mqprio before adding
> >> mqprio hardware offload support in hns3 driver.
> >
> > I think one of the biggest issues in tying this to DCB configuration is the
> > non-immediate [and possibly non persistent] configuration.
> >
> > Scenario #1:
> > User is configuring mqprio offloaded with 3 TCs while device is in willing
> mode.
> > Would you expect the driver to immediately respond with a success or
> instead
> > delay the return until the DCBx negotiation is complete and the operational
> > num of TCs is actually 3?
> 
> Well, when user requests the mqprio offloaded by a hardware shared by DCB,
> I expect
> the user is not using the dcb tool.
> If user is still using dcb tool, then result is undefined.
> 
> The scenario you mention maybe can be enforced by setting willing to zero
> when user
> is requesting the mqprio offload, and restore the willing bit when unloaded
> the mqprio
> offload.

Sounds a bit harsh but would probably work.

> But I think the real issue is that dcb and mqprio shares the tc system in the
> stack,
> the problem may be better to be fixed in the stack rather than in the driver,
> as you
> suggested in the DCB patchset. What do you think?

What did you have in mind?

> 
> >
> > Scenario #2:
> > Assume user explicitly offloaded mqprio with 3 TCs, but now DCB
> configuration
> > has changed on the peer side and 4 TCs is the new negotiated operational
> value.
> > Your current driver logic would change the number of TCs underneath the
> user
> > configuration [and it would actually probably work due to mqprio being a
> crappy
> > qdisc]. But was that the user actual intention?
> > [I think the likely answer in this scenario is 'yes' since the alternative 
> > is no
> better.
> > But I still thought it was worth mentioning]
> 
> You are right, the problem also have something to do with mqprio and dcb
> sharing
> the tc in the stack.
> 
> During testing, when user explicitly offloaded mqprio with 3 TCs, all
> queue has a default pfifo mqprio attached, after DCB changes the tc num to
> 4,
> using tc qdisc shows some queue does not have a default pfifo mqprio
> attached.

Really? Then what did it show? 
[I assume it has some pfifo attached, and it's an mqprio dump kind of an issue]

> 
> Maybe we can add a callback to notify mqprio the configuration has changed.
> 

Which would do what?
You already have the notifications available for monitoring using dcbnl logic
if the configuration changes [for user]; So user can re-configure whatever it
wants. But other than dropping all the qdisc configurations and going back to
the default qdiscs, what default action would mqprio be able to do when
configuration changes that actually makes sense?

> Thanks
> Yunsheng Lin
> 
> >
> > Cheers,
> > Yuval
> >
> >>
> >> Yunsheng Lin (2):
> >>   mqprio: Add a new hardware offload type in mqprio
> >>   net: hns3: Add mqprio hardware offload support in hns3 driver
> >>
> >>  drivers/net/ethernet/hisilicon/hns3/hnae3.h|  1 +
> >>  .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 23 +++
> >>  .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 46
> ++-
> >> ---
> >>  include/uapi/linux/pkt_sched.h |  1 +
> >>  4 files changed, 55 insertions(+), 16 deletions(-)
> >>
> >> --
> >> 1.9.1
> >
> >
> >

RE: [PATCH net-next 0/2] Add mqprio hardware offload support in hns3 driver

2017-10-12 Thread Yuval Mintz

> This patchset adds a new hardware offload type in mqprio before adding
> mqprio hardware offload support in hns3 driver.

I think one of the biggest issues in tying this to DCB configuration is the
non-immediate [and possibly non persistent] configuration.

Scenario #1:
User is configuring mqprio offloaded with 3 TCs while device is in willing mode.
Would you expect the driver to immediately respond with a success or instead
delay the return until the DCBx negotiation is complete and the operational
num of TCs is actually 3?

Scenario #2:
Assume user explicitly offloaded mqprio with 3 TCs, but now DCB configuration
has changed on the peer side and 4 TCs is the new negotiated operational value.
Your current driver logic would change the number of TCs underneath the user
configuration [and it would actually probably work due to mqprio being a crappy
qdisc]. But was that the user actual intention?
[I think the likely answer in this scenario is 'yes' since the alternative is
no better. But I still thought it was worth mentioning]

Cheers,
Yuval

> 
> Yunsheng Lin (2):
>   mqprio: Add a new hardware offload type in mqprio
>   net: hns3: Add mqprio hardware offload support in hns3 driver
> 
>  drivers/net/ethernet/hisilicon/hns3/hnae3.h|  1 +
>  .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 23 +++
>  .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 46 ++-
> ---
>  include/uapi/linux/pkt_sched.h |  1 +
>  4 files changed, 55 insertions(+), 16 deletions(-)
> 
> --
> 1.9.1

RE: [PATCH net-next 1/2] mqprio: Add a new hardware offload type in mqprio

2017-10-12 Thread Yuval Mintz

> When a driver supports both dcb and hardware offloaded mqprio, and
> user is running mqprio and dcb tool concurrently, the configuration
> set by each tool may be conflicted with each other because the dcb
(for second 'each') s/each/the

> and mqprio may be using the same hardwere offload component and share
s/hardwere/hardware

> the tc system in the network stack.
> 
> This patch adds a new offload type to indicate that the underlying
> driver offload prio mapping as part of DCB. If the driver would be
'should' offload

> incapable of that it would refuse the offload. User would then have
> to explicitly request that qdisc offload.

RE: [net-next 11/15] i40evf: Enable VF to request an alternate queue allocation

2017-10-02 Thread Yuval Mintz

> + case VIRTCHNL_OP_REQUEST_QUEUES: {
> + struct virtchnl_vf_res_request *vfres =
> + (struct virtchnl_vf_res_request *)msg;
> + if (vfres->num_queue_pairs == adapter->num_req_queues)
> {
> + adapter->flags |=
> I40EVF_FLAG_REINIT_ITR_NEEDED;
> + i40evf_schedule_reset(adapter);
> + } else {
> + dev_info(>pdev->dev,
> +  "Requested %d queues, PF can support
> %d\n",
> +  adapter->num_req_queues,
> +  vfres->num_queue_pairs);
> + adapter->num_req_queues = 0;
> + }
> + }
> + break;

Something is odd about your parenthesis.

>   default:
>   if (adapter->current_op && (v_opcode != adapter-
> >current_op))
>   dev_warn(>pdev->dev, "Expected
> response %d from PF, received %d\n",
> --
> 2.14.2

RE: [patch net-next 1/7] skbuff: Add the offload_mr_fwd_mark field

2017-09-29 Thread Yuval Mintz

> hello Jiri and Yotam,
> 
> On Thu, 2017-09-28 at 19:34 +0200, Jiri Pirko wrote:
> > From: Yotam Gigi 
> >
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index 19e64bf..ada8214 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -772,6 +772,7 @@ struct sk_buff {
> > __u8remcsum_offload:1;
> >  #ifdef CONFIG_NET_SWITCHDEV
> > __u8offload_fwd_mark:1;
> > +   __u8offload_mr_fwd_mark:1;
> 
> I had a look at the pahole output:
> 
> $ make allyesconfig
> $ make net/core/skbuff.o
> $ pahole net/core/skbuff.o | grep -C7 tc_from_ingress
> 
> 
> __u8   ipvs_property:1;  /*   147: 7  1 */
> __u8   inner_protocol_type:1; /*   147: 6  1 */
> __u8   remcsum_offload:1;/*   147: 5  1 */
> __u8   offload_fwd_mark:1;   /*   147: 4  1 */
> __u8   tc_skip_classify:1;   /*   147: 3  1 */
> __u8   tc_at_ingress:1;  /*   147: 2  1 */
> __u8   tc_redirected:1;  /*   147: 1  1 */
> __u8   tc_from_ingress:1;/*   147: 0  1 */
> __u16  tc_index; /*   148 2 */
> 
> /* XXX 2 bytes hole, try to pack */
> 
> union {
> __wsum csum; /*   4 */
> struct {
> 
> apparently there are no more spare bits to use at that offset: therefore,
> adding 'offload_mr_fwd_mark' before 'tc_skip_classify' will make
> 'tc_from_ingress' slip at offset 148, and tc_index at offset 150.
> I think you can use that 2-bytes hole below tc_index, and also move the
> offload_fwd_mark bit there, as we use both when
> CONFIG_NET_SWITCHDEV is
> enabled. This way we will also gain one spare bit, without changing the
> struct size or worsening the cacheline alignments.
> 
> what do you think?

Your pahole output still shows a 2B hole until the following union
which is 4B-aligned.
While it's true tc_index moves to offset 150, the union will not move 
[I.e., stay at offset 152] so the layout doesn't really change [greatly]
nor the size of the struct. And we have the benefit of all the bits
remaining consecutive.

RE: [PATCH v2 net-next 10/10] net: hns3: Add mqprio support when interacting with network stack

2017-09-26 Thread Yuval Mintz

> Hi, Yuval
> 
> On 2017/9/26 14:43, Yuval Mintz wrote:
> >> When using tc qdisc to configure DCB parameter, dcb_ops->setup_tc
> >> is used to tell hclge_dcb module to do the setup.
> >
> > While this might be a step in the right direction, this causes an 
> > inconsistency
> > in user experience - Some [well, most] vendors didn't allow the mqprio
> > priority mapping to affect DCB, instead relying on the dcbnl functionality
> > to control that configuration.
> >
> > A couple of options to consider:
> >   - Perhaps said logic shouldn't be contained inside the driver but rather
> >  in mqprio logic itself. I.e., rely on DCBNL functionality [if 
> > available] from
> >  within mqprio and try changing the configuration.
> 
> In net/dcb/dcbnl.c
> dcbnl_ieee_set already call dcbnl_ieee_notify to notify the user space
> configuration has changed, does this dcbnl_ieee_notify function do the
> job for us? I am not sure if lldpad has registered for this notification.

Not that familiar with the dcbnl calls; Shouldn't dcbnl_setall be called to
make the configuration apply [or is that only for ieee]?
Regardless, I don't know if it makes sense to assume a user application would
fix the qdisc configuration upon notification while the dcbnl logic in the
kernel could have done that instead.

> As you suggested below, can we add a new TC_MQPRIO_HW_OFFLOAD_
> value to
> reflect that the configuration is needed to be changed by dcbnl_ieee_set
> (perhaps some other function) in dcbnl?
> Do you think it is feasible?

Either I'm misreading your answer or we're thinking of it from two opposite ends.
I was thinking that the new offloaded flag would indicate to the underlying
driver that it's expected to offload the prio mapping [as part of DCB].
If the driver would be incapable of that it would refuse the offload.
User would then have to explicitly request that the qdisc offload.

> 
> 
> >   - Add a new TC_MQPRIO_HW_OFFLOAD_ value to explicitly reflect user
> >  request to allow this configuration to affect DCB.
> >
> >> When using lldptool to configure DCB parameter, hclge_dcb module
> >> call the client_ops->setup_tc to tell network stack which queue
> >> and priority is using for specific tc.
> >
> > You're basically bypassing the mqprio logic.
> > Since you're configuring the prio->queue mapping from DCB flow,
> > you'll get an mqprio-like behavior [meaning a transmitted packet
> > would reach a transmission queue associated with its priority] even
> > if device wasn't grafted with an mqprio qdisc.
> > Why should your user even use mqprio? What benefit does he get from it?
> >
> > ...
> >
> >> +static int hns3_nic_set_real_num_queue(struct net_device *netdev)
> >> +{
> >> +  struct hns3_nic_priv *priv = netdev_priv(netdev);
> >> +  struct hnae3_handle *h = priv->ae_handle;
> >> +  struct hnae3_knic_private_info *kinfo = >kinfo;
> >> +  unsigned int queue_size = kinfo->rss_size * kinfo->num_tc;
> >> +  int ret;
> >> +
> >> +  ret = netif_set_real_num_tx_queues(netdev, queue_size);
> >> +  if (ret) {
> >> +  netdev_err(netdev,
> >> + "netif_set_real_num_tx_queues fail, ret=%d!\n",
> >> + ret);
> >> +  return ret;
> >> +  }
> >> +
> >> +  ret = netif_set_real_num_rx_queues(netdev, queue_size);
> >
> > I don't think you're changing the driver behavior, but why are you setting
> > the real number of rx queues based on the number of TCs?
> > Do you actually open (TC x RSS) Rx queues?
> >
> > .
> >

RE: [PATCH v2 net-next 10/10] net: hns3: Add mqprio support when interacting with network stack

2017-09-26 Thread Yuval Mintz

> When using tc qdisc to configure DCB parameter, dcb_ops->setup_tc
> is used to tell hclge_dcb module to do the setup.

While this might be a step in the right direction, this causes an inconsistency
in user experience - Some [well, most] vendors didn't allow the mqprio
priority mapping to affect DCB, instead relying on the dcbnl functionality
to control that configuration.

A couple of options to consider:
  - Perhaps said logic shouldn't be contained inside the driver but rather
    in mqprio logic itself. I.e., rely on DCBNL functionality [if available]
    from within mqprio and try changing the configuration.
  - Add a new TC_MQPRIO_HW_OFFLOAD_ value to explicitly reflect user
 request to allow this configuration to affect DCB.

> When using lldptool to configure DCB parameter, hclge_dcb module
> call the client_ops->setup_tc to tell network stack which queue
> and priority is using for specific tc.

You're basically bypassing the mqprio logic.
Since you're configuring the prio->queue mapping from DCB flow,
you'll get an mqprio-like behavior [meaning a transmitted packet
would reach a transmission queue associated with its priority] even
if device wasn't grafted with an mqprio qdisc.
Why should your user even use mqprio? What benefit does he get from it?

...

> +static int hns3_nic_set_real_num_queue(struct net_device *netdev)
> +{
> + struct hns3_nic_priv *priv = netdev_priv(netdev);
> + struct hnae3_handle *h = priv->ae_handle;
> + struct hnae3_knic_private_info *kinfo = >kinfo;
> + unsigned int queue_size = kinfo->rss_size * kinfo->num_tc;
> + int ret;
> +
> + ret = netif_set_real_num_tx_queues(netdev, queue_size);
> + if (ret) {
> + netdev_err(netdev,
> +"netif_set_real_num_tx_queues fail, ret=%d!\n",
> +ret);
> + return ret;
> + }
> +
> + ret = netif_set_real_num_rx_queues(netdev, queue_size);

I don't think you're changing the driver behavior, but why are you setting
the real number of rx queues based on the number of TCs?
Do you actually open (TC x RSS) Rx queues?

[PATCH] net: Remove ndo_dfwd_start_xmit

2017-06-25 Thread Yuval Mintz

Looks like commit f663dd9aaf9e ("net: core: explicitly select a txq before
doing l2 forwarding") has removed the need for this dedicated xmit function
[it even explicitly states so in its commit log message] but it hasn't
removed the definition of the ndo.

Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
CC: Jason Wang <jasow...@redhat.com>
CC: John Fastabend <john.r.fastab...@intel.com>
---
 include/linux/netdevice.h | 9 -
 1 file changed, 9 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 68f5d89..85f01d6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1114,12 +1114,6 @@ struct xfrmdev_ops {
  * by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
  * the station and priv is the structure returned by the add
  * operation.
- * netdev_tx_t (*ndo_dfwd_start_xmit)(struct sk_buff *skb,
- *   struct net_device *dev,
- *   void *priv);
- * Callback to use for xmit over the accelerated station. This
- * is used in place of ndo_start_xmit on accelerated net
- * devices.
  * int (*ndo_set_tx_maxrate)(struct net_device *dev,
  *  int queue_index, u32 maxrate);
  * Called when a user wants to set a max-rate limitation of specific
@@ -1316,9 +1310,6 @@ struct net_device_ops {
void(*ndo_dfwd_del_station)(struct net_device *pdev,
void *priv);
 
-   netdev_tx_t (*ndo_dfwd_start_xmit) (struct sk_buff *skb,
-   struct net_device *dev,
-   void *priv);
int (*ndo_get_lock_subclass)(struct net_device 
*dev);
int (*ndo_set_tx_maxrate)(struct net_device *dev,
  int queue_index,
-- 
2.9.4

[PATCH net] bnx2x: Don't log mc removal needlessly

2017-06-24 Thread Yuval Mintz

When mc configuration changes bnx2x_config_mcast() can return 0 for
success, negative for failure and positive for benign reason preventing
its immediate work, e.g., when the command awaits the completion of
a previously sent command.

When removing all configured macs on a 578xx adapter, if a positive
value is returned the driver would erroneously log it as an error.

Fixes: c7b7b483ccc9 ("bnx2x: Don't flush multicast MACs")
Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
---
Dave,

Please consider applying this to 'net'.

Thanks,
Yuval
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index a851f95..349a465 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -12729,7 +12729,7 @@ static int bnx2x_set_mc_list(struct bnx2x *bp)
} else {
/* If no mc addresses are required, flush the configuration */
rc = bnx2x_config_mcast(bp, , BNX2X_MCAST_CMD_DEL);
-   if (rc)
+   if (rc < 0)
BNX2X_ERR("Failed to clear multicast configuration 
%d\n",
  rc);
}
-- 
2.9.4

[PATCH net-next] qede: Fix compilation without QED_RDMA

2017-06-20 Thread Yuval Mintz

From: Chad Dupuis <chad.dup...@cavium.com>

When CONFIG_QED_RDMA isn't defined, we'd hit the following:

 /include/linux/qed/qede_rdma.h:84:19:
 warning: ‘qede_rdma_dev_add’ used but never defined [enabled by default]
 static inline int qede_rdma_dev_add(struct qede_dev *dev);

Fixes: bbfcd1e8e167 ("qed*: Set rdma generic functions prefix")
Signed-off-by: Chad Dupuis <chad.dup...@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
---
 include/linux/qed/qede_rdma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/qed/qede_rdma.h b/include/linux/qed/qede_rdma.h
index 1348a16..9904617 100644
--- a/include/linux/qed/qede_rdma.h
+++ b/include/linux/qed/qede_rdma.h
@@ -81,7 +81,7 @@ void qede_rdma_dev_remove(struct qede_dev *dev);
 void qede_rdma_event_changeaddr(struct qede_dev *edr);
 
 #else
-static inline int qede_rdma_dev_add(struct qede_dev *dev);
+static inline int qede_rdma_dev_add(struct qede_dev *dev)
 {
return 0;
 }
-- 
2.9.4

[PATCH v4 net-next 7/7] qed: SPQ async callback registration

2017-06-20 Thread Yuval Mintz

From: Michal Kalderon <michal.kalde...@cavium.com>

Whenever firmware indicates that there's an async indication it needs
to handle, there's a switch-case where the right functionality is called
based on function's personality and information.

Before iWARP is added [as yet another client], switch over the SPQ into
a callback-registered mechanism, allowing registration of the relevant
event-processing logic based on the function's personality. This allows
us to tidy the code by removing protocol-specifics from a common file.

Signed-off-by: Michal Kalderon <michal.kalde...@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_iscsi.c | 24 -
 drivers/net/ethernet/qlogic/qed/qed_roce.c  | 16 ++---
 drivers/net/ethernet/qlogic/qed/qed_roce.h  |  6 
 drivers/net/ethernet/qlogic/qed/qed_sp.h| 17 +
 drivers/net/ethernet/qlogic/qed/qed_spq.c   | 54 -
 drivers/net/ethernet/qlogic/qed/qed_sriov.c | 16 +++--
 drivers/net/ethernet/qlogic/qed/qed_sriov.h | 18 --
 7 files changed, 96 insertions(+), 55 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c 
b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
index 5a1ed05..813c77c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
@@ -62,6 +62,22 @@
 #include "qed_sriov.h"
 #include "qed_reg_addr.h"
 
+static int
+qed_iscsi_async_event(struct qed_hwfn *p_hwfn,
+ u8 fw_event_code,
+ u16 echo, union event_ring_data *data, u8 fw_return_code)
+{
+   if (p_hwfn->p_iscsi_info->event_cb) {
+   struct qed_iscsi_info *p_iscsi = p_hwfn->p_iscsi_info;
+
+   return p_iscsi->event_cb(p_iscsi->event_context,
+fw_event_code, data);
+   } else {
+   DP_NOTICE(p_hwfn, "iSCSI async completion is not set\n");
+   return -EINVAL;
+   }
+}
+
 struct qed_iscsi_conn {
struct list_head list_entry;
bool free_on_delete;
@@ -265,6 +281,9 @@ qed_sp_iscsi_func_start(struct qed_hwfn *p_hwfn,
p_hwfn->p_iscsi_info->event_context = event_context;
p_hwfn->p_iscsi_info->event_cb = async_event_cb;
 
+   qed_spq_register_async_cb(p_hwfn, PROTOCOLID_ISCSI,
+ qed_iscsi_async_event);
+
return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
@@ -631,7 +650,10 @@ static int qed_sp_iscsi_func_stop(struct qed_hwfn *p_hwfn,
p_ramrod = _ent->ramrod.iscsi_destroy;
p_ramrod->hdr.op_code = ISCSI_RAMROD_CMD_ID_DESTROY_FUNC;
 
-   return qed_spq_post(p_hwfn, p_ent, NULL);
+   rc = qed_spq_post(p_hwfn, p_ent, NULL);
+
+   qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_ISCSI);
+   return rc;
 }
 
 static void __iomem *qed_iscsi_get_db_addr(struct qed_hwfn *p_hwfn, u32 cid)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c 
b/drivers/net/ethernet/qlogic/qed/qed_roce.c
index 7482905..673f80a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_roce.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c
@@ -68,12 +68,14 @@
 
 static void qed_roce_free_real_icid(struct qed_hwfn *p_hwfn, u16 icid);
 
-void qed_roce_async_event(struct qed_hwfn *p_hwfn,
- u8 fw_event_code, union rdma_eqe_data *rdma_data)
+static int
+qed_roce_async_event(struct qed_hwfn *p_hwfn,
+u8 fw_event_code,
+u16 echo, union event_ring_data *data, u8 fw_return_code)
 {
if (fw_event_code == ROCE_ASYNC_EVENT_DESTROY_QP_DONE) {
u16 icid =
-   (u16)le32_to_cpu(rdma_data->rdma_destroy_qp_data.cid);
+   (u16)le32_to_cpu(data->rdma_data.rdma_destroy_qp_data.cid);
 
/* icid release in this async event can occur only if the icid
 * was offloaded to the FW. In case it wasn't offloaded this is
@@ -85,8 +87,10 @@ void qed_roce_async_event(struct qed_hwfn *p_hwfn,
 
events->affiliated_event(p_hwfn->p_rdma_info->events.context,
 fw_event_code,
-_data->async_handle);
+(void *)>rdma_data.async_handle);
}
+
+   return 0;
 }
 
 static int qed_rdma_bmap_alloc(struct qed_hwfn *p_hwfn,
@@ -686,6 +690,9 @@ static int qed_rdma_setup(struct qed_hwfn *p_hwfn,
if (rc)
return rc;
 
+   qed_spq_register_async_cb(p_hwfn, PROTOCOLID_ROCE,
+ qed_roce_async_event);
+
return qed_rdma_start_fw(p_hwfn, params, p_ptt);
 }
 
@@ -706,6 +713,7 @@ void qed_roce_stop(struct qed_hwfn *p_hwfn)
break;
}
}
+   qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_ROCE);
 }

[PATCH v4 net-next 6/7] qed: Wait for resources before FUNC_CLOSE

2017-06-20 Thread Yuval Mintz

From: Michal Kalderon <michal.kalde...@cavium.com>

Driver needs to wait for all resources to return from FW before it can send
the FUNC_CLOSE ramrod.

Signed-off-by: Michal Kalderon <michal.kalde...@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_roce.c | 35 +-
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c 
b/drivers/net/ethernet/qlogic/qed/qed_roce.c
index 8419dcc..7482905 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_roce.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c
@@ -372,22 +372,7 @@ static void qed_rdma_bmap_free(struct qed_hwfn *p_hwfn,
 
 static void qed_rdma_resc_free(struct qed_hwfn *p_hwfn)
 {
-   struct qed_bmap *rcid_map = _hwfn->p_rdma_info->real_cid_map;
struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info;
-   int wait_count = 0;
-
-   /* when destroying a_RoCE QP the control is returned to the user after
-* the synchronous part. The asynchronous part may take a little longer.
-* We delay for a short while if an async destroy QP is still expected.
-* Beyond the added delay we clear the bitmap anyway.
-*/
-   while (bitmap_weight(rcid_map->bitmap, rcid_map->max_count)) {
-   msleep(100);
-   if (wait_count++ > 20) {
-   DP_NOTICE(p_hwfn, "cid bitmap wait timed out\n");
-   break;
-   }
-   }
 
qed_rdma_bmap_free(p_hwfn, _hwfn->p_rdma_info->cid_map, 1);
qed_rdma_bmap_free(p_hwfn, _hwfn->p_rdma_info->pd_map, 1);
@@ -704,6 +689,25 @@ static int qed_rdma_setup(struct qed_hwfn *p_hwfn,
return qed_rdma_start_fw(p_hwfn, params, p_ptt);
 }
 
+void qed_roce_stop(struct qed_hwfn *p_hwfn)
+{
+   struct qed_bmap *rcid_map = _hwfn->p_rdma_info->real_cid_map;
+   int wait_count = 0;
+
+   /* when destroying a_RoCE QP the control is returned to the user after
+* the synchronous part. The asynchronous part may take a little longer.
+* We delay for a short while if an async destroy QP is still expected.
+* Beyond the added delay we clear the bitmap anyway.
+*/
+   while (bitmap_weight(rcid_map->bitmap, rcid_map->max_count)) {
+   msleep(100);
+   if (wait_count++ > 20) {
+   DP_NOTICE(p_hwfn, "cid bitmap wait timed out\n");
+   break;
+   }
+   }
+}
+
 static int qed_rdma_stop(void *rdma_cxt)
 {
struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt;
@@ -733,6 +737,7 @@ static int qed_rdma_stop(void *rdma_cxt)
qed_wr(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN,
   (ll2_ethertype_en & 0xFFFE));
 
+   qed_roce_stop(p_hwfn);
qed_ptt_release(p_hwfn, p_ptt);
 
/* Get SPQ entry */
-- 
2.9.4

[PATCH v4 net-next 3/7] qed: Disable RoCE dpm when DCBx change occurs

2017-06-20 Thread Yuval Mintz

If DCBx update occurs while QPs are open, stop sending edpms until all
QPs are closed.

Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c |  8 +++
 drivers/net/ethernet/qlogic/qed/qed_roce.c | 36 ++
 drivers/net/ethernet/qlogic/qed/qed_roce.h |  5 +
 3 files changed, 49 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c 
b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
index 15b516a..f888045 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -44,6 +44,7 @@
 #include "qed_hsi.h"
 #include "qed_sp.h"
 #include "qed_sriov.h"
+#include "qed_roce.h"
 #ifdef CONFIG_DCB
 #include 
 #endif
@@ -892,6 +893,13 @@ qed_dcbx_mib_update_event(struct qed_hwfn *p_hwfn,
 
/* update storm FW with negotiation results */
qed_sp_pf_update(p_hwfn);
+
+   /* for roce PFs, we may want to enable/disable DPM
+* when DCBx change occurs
+*/
+   if (p_hwfn->hw_info.personality ==
+   QED_PCI_ETH_ROCE)
+   qed_roce_dpm_dcbx(p_hwfn, p_ptt);
}
}
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c 
b/drivers/net/ethernet/qlogic/qed/qed_roce.c
index 4bc2f6c..8419dcc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_roce.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c
@@ -162,6 +162,11 @@ static int qed_bmap_test_id(struct qed_hwfn *p_hwfn,
return test_bit(id_num, bmap->bitmap);
 }
 
+static bool qed_bmap_is_empty(struct qed_bmap *bmap)
+{
+   return bmap->max_count == find_first_bit(bmap->bitmap, bmap->max_count);
+}
+
 static u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id)
 {
/* First sb id for RoCE is after all the l2 sb */
@@ -2638,6 +2643,23 @@ static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev)
return QED_LEADING_HWFN(cdev);
 }
 
+static bool qed_rdma_allocated_qps(struct qed_hwfn *p_hwfn)
+{
+   bool result;
+
+   /* if rdma info has not been allocated, naturally there are no qps */
+   if (!p_hwfn->p_rdma_info)
+   return false;
+
+   spin_lock_bh(_hwfn->p_rdma_info->lock);
+   if (!p_hwfn->p_rdma_info->cid_map.bitmap)
+   result = false;
+   else
+   result = !qed_bmap_is_empty(_hwfn->p_rdma_info->cid_map);
+   spin_unlock_bh(_hwfn->p_rdma_info->lock);
+   return result;
+}
+
 static void qed_rdma_dpm_conf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 {
u32 val;
@@ -2650,6 +2672,20 @@ static void qed_rdma_dpm_conf(struct qed_hwfn *p_hwfn, 
struct qed_ptt *p_ptt)
   val, p_hwfn->dcbx_no_edpm, p_hwfn->db_bar_no_edpm);
 }
 
+void qed_roce_dpm_dcbx(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+   u8 val;
+
+   /* if any QPs are already active, we want to disable DPM, since their
+* context information contains information from before the latest DCBx
+* update. Otherwise enable it.
+*/
+   val = qed_rdma_allocated_qps(p_hwfn) ? true : false;
+   p_hwfn->dcbx_no_edpm = (u8)val;
+
+   qed_rdma_dpm_conf(p_hwfn, p_ptt);
+}
+
 void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 {
p_hwfn->db_bar_no_edpm = true;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.h 
b/drivers/net/ethernet/qlogic/qed/qed_roce.h
index 94be3b5..ddd7761 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_roce.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_roce.h
@@ -168,10 +168,15 @@ struct qed_rdma_qp {
 
 #if IS_ENABLED(CONFIG_QED_RDMA)
 void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
+void qed_roce_dpm_dcbx(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
 void qed_roce_async_event(struct qed_hwfn *p_hwfn,
  u8 fw_event_code, union rdma_eqe_data *rdma_data);
 #else
 static inline void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt 
*p_ptt) {}
+
+static inline void qed_roce_dpm_dcbx(struct qed_hwfn *p_hwfn,
+struct qed_ptt *p_ptt) {}
+
 static inline void qed_roce_async_event(struct qed_hwfn *p_hwfn,
u8 fw_event_code,
union rdma_eqe_data *rdma_data) {}
-- 
2.9.4

[PATCH v4 net-next 2/7] qed: RoCE EDPM to honor PFC

2017-06-20 Thread Yuval Mintz

Configure device according to DCBx results so that EDPMs
made by RoCE would honor flow-control.

Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 16 
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h |  6 ++
 2 files changed, 22 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c 
b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
index e2a62c0..15b516a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -896,6 +896,22 @@ qed_dcbx_mib_update_event(struct qed_hwfn *p_hwfn,
}
 
qed_dcbx_get_params(p_hwfn, _hwfn->p_dcbx_info->get, type);
+
+   if (type == QED_DCBX_OPERATIONAL_MIB) {
+   struct qed_dcbx_results *p_data;
+   u16 val;
+
+   /* Configure in NIG which protocols support EDPM and should
+* honor PFC.
+*/
+   p_data = _hwfn->p_dcbx_info->results;
+   val = (0x1 << p_data->arr[DCBX_PROTOCOL_ROCE].tc) |
+ (0x1 << p_data->arr[DCBX_PROTOCOL_ROCE_V2].tc);
+   val <<= NIG_REG_TX_EDPM_CTRL_TX_EDPM_TC_EN_SHIFT;
+   val |= NIG_REG_TX_EDPM_CTRL_TX_EDPM_EN;
+   qed_wr(p_hwfn, p_ptt, NIG_REG_TX_EDPM_CTRL, val);
+   }
+
qed_dcbx_aen(p_hwfn, type);
 
return rc;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h 
b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index 7e4639c..0cdb433 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -1564,6 +1564,12 @@
 #define NIG_REG_TSGEN_FREECNT_UPDATE_K2 0x509008UL
 #define CNIG_REG_NIG_PORT0_CONF_K2 0x218200UL
 
+#define NIG_REG_TX_EDPM_CTRL 0x501f0cUL
+#define NIG_REG_TX_EDPM_CTRL_TX_EDPM_EN (0x1 << 0)
+#define NIG_REG_TX_EDPM_CTRL_TX_EDPM_EN_SHIFT 0
+#define NIG_REG_TX_EDPM_CTRL_TX_EDPM_TC_EN (0xff << 1)
+#define NIG_REG_TX_EDPM_CTRL_TX_EDPM_TC_EN_SHIFT 1
+
 #define PRS_REG_SEARCH_GFT 0x1f11bcUL
 #define PRS_REG_CM_HDR_GFT 0x1f11c8UL
 #define PRS_REG_GFT_CAM 0x1f1100UL
-- 
2.9.4

[PATCH v4 net-next 4/7] qed*: qede_roce.[ch] -> qede_rdma.[ch]

2017-06-20 Thread Yuval Mintz

From: Michal Kalderon <michal.kalde...@cavium.com>

Once we have iWARP support, the qede portion of the qedr<->qede would
serve all the RDMA protocols - so rename the file to be appropriate
to its function.

While we're at it, we're also moving a couple of inclusions to it into
.h files and adding includes to make sure it contains all type
definitions it requires.

Signed-off-by: Michal Kalderon <michal.kalde...@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
---
 drivers/infiniband/hw/qedr/main.c | 2 +-
 drivers/infiniband/hw/qedr/qedr.h | 2 +-
 drivers/net/ethernet/qlogic/qede/Makefile | 2 +-
 drivers/net/ethernet/qlogic/qede/qede.h   | 1 +
 drivers/net/ethernet/qlogic/qede/qede_main.c  | 1 -
 drivers/net/ethernet/qlogic/qede/{qede_roce.c => qede_rdma.c} | 2 +-
 include/linux/qed/{qede_roce.h => qede_rdma.h}| 5 +
 7 files changed, 10 insertions(+), 5 deletions(-)
 rename drivers/net/ethernet/qlogic/qede/{qede_roce.c => qede_rdma.c} (99%)
 rename include/linux/qed/{qede_roce.h => qede_rdma.h} (96%)

diff --git a/drivers/infiniband/hw/qedr/main.c 
b/drivers/infiniband/hw/qedr/main.c
index 5a32b80..714eb0c 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -37,7 +37,7 @@
 #include 
 #include 
 #include 
-#include 
+
 #include 
 #include 
 #include "qedr.h"
diff --git a/drivers/infiniband/hw/qedr/qedr.h 
b/drivers/infiniband/hw/qedr/qedr.h
index 80333ec..2376019 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -37,7 +37,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include "qedr_hsi_rdma.h"
 
diff --git a/drivers/net/ethernet/qlogic/qede/Makefile 
b/drivers/net/ethernet/qlogic/qede/Makefile
index bc5f7c3..75408fb 100644
--- a/drivers/net/ethernet/qlogic/qede/Makefile
+++ b/drivers/net/ethernet/qlogic/qede/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_QEDE) := qede.o
 
 qede-y := qede_main.o qede_fp.o qede_filter.o qede_ethtool.o qede_ptp.o
 qede-$(CONFIG_DCB) += qede_dcbnl.o
-qede-$(CONFIG_QED_RDMA) += qede_roce.o
+qede-$(CONFIG_QED_RDMA) += qede_rdma.o
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h 
b/drivers/net/ethernet/qlogic/qede/qede.h
index 694c09b..2d6b30c 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #ifdef CONFIG_RFS_ACCEL
 #include 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c 
b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 37ad799..e9eaa38 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -60,7 +60,6 @@
 #include 
 #include 
 #include 
-#include 
 #include "qede.h"
 #include "qede_ptp.h"
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_roce.c 
b/drivers/net/ethernet/qlogic/qede/qede_rdma.c
similarity index 99%
rename from drivers/net/ethernet/qlogic/qede/qede_roce.c
rename to drivers/net/ethernet/qlogic/qede/qede_rdma.c
index c0030fb..9837ee2 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_roce.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_rdma.c
@@ -33,7 +33,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include "qede.h"
 
 static struct qedr_driver *qedr_drv;
diff --git a/include/linux/qed/qede_roce.h b/include/linux/qed/qede_rdma.h
similarity index 96%
rename from include/linux/qed/qede_roce.h
rename to include/linux/qed/qede_rdma.h
index 3b8dd55..a1a9b81 100644
--- a/include/linux/qed/qede_roce.h
+++ b/include/linux/qed/qede_rdma.h
@@ -32,6 +32,11 @@
 #ifndef QEDE_ROCE_H
 #define QEDE_ROCE_H
 
+#include 
+#include 
+#include 
+#include 
+
 struct qedr_dev;
 struct qed_dev;
 struct qede_dev;
-- 
2.9.4

[PATCH v4 net-next 5/7] qed*: Set rdma generic functions prefix

2017-06-20 Thread Yuval Mintz

From: Michal Kalderon <michal.kalde...@cavium.com>

Rename the functions common to both iWARP and RoCE to have a prefix of
_rdma_ instead of _roce_.

Signed-off-by: Michal Kalderon <michal.kalde...@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
---
 drivers/infiniband/hw/qedr/main.c|   6 +-
 drivers/net/ethernet/qlogic/qede/qede.h  |   4 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c |  12 +--
 drivers/net/ethernet/qlogic/qede/qede_rdma.c | 142 +--
 include/linux/qed/qede_rdma.h|  37 +++
 5 files changed, 101 insertions(+), 100 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/main.c 
b/drivers/infiniband/hw/qedr/main.c
index 714eb0c..b5851fd 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -902,7 +902,7 @@ static void qedr_mac_address_change(struct qedr_dev *dev)
  * initialization done before RoCE driver notifies
  * event to stack.
  */
-static void qedr_notify(struct qedr_dev *dev, enum qede_roce_event event)
+static void qedr_notify(struct qedr_dev *dev, enum qede_rdma_event event)
 {
switch (event) {
case QEDE_UP:
@@ -931,12 +931,12 @@ static struct qedr_driver qedr_drv = {
 
 static int __init qedr_init_module(void)
 {
-   return qede_roce_register_driver(_drv);
+   return qede_rdma_register_driver(_drv);
 }
 
 static void __exit qedr_exit_module(void)
 {
-   qede_roce_unregister_driver(_drv);
+   qede_rdma_unregister_driver(_drv);
 }
 
 module_init(qedr_init_module);
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h 
b/drivers/net/ethernet/qlogic/qede/qede.h
index 2d6b30c..4dfb238 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -154,8 +154,8 @@ struct qede_vlan {
 struct qede_rdma_dev {
struct qedr_dev *qedr_dev;
struct list_head entry;
-   struct list_head roce_event_list;
-   struct workqueue_struct *roce_wq;
+   struct list_head rdma_event_list;
+   struct workqueue_struct *rdma_wq;
 };
 
 struct qede_ptp;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c 
b/drivers/net/ethernet/qlogic/qede/qede_main.c
index e9eaa38..06ca13d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -262,7 +262,7 @@ static int qede_netdev_event(struct notifier_block *this, 
unsigned long event,
break;
case NETDEV_CHANGEADDR:
edev = netdev_priv(ndev);
-   qede_roce_event_changeaddr(edev);
+   qede_rdma_event_changeaddr(edev);
break;
}
 
@@ -977,7 +977,7 @@ static int __qede_probe(struct pci_dev *pdev, u32 
dp_module, u8 dp_level,
 
qede_init_ndev(edev);
 
-   rc = qede_roce_dev_add(edev);
+   rc = qede_rdma_dev_add(edev);
if (rc)
goto err3;
 
@@ -1013,7 +1013,7 @@ static int __qede_probe(struct pci_dev *pdev, u32 
dp_module, u8 dp_level,
return 0;
 
 err4:
-   qede_roce_dev_remove(edev);
+   qede_rdma_dev_remove(edev);
 err3:
free_netdev(edev->ndev);
 err2:
@@ -1064,7 +1064,7 @@ static void __qede_remove(struct pci_dev *pdev, enum 
qede_remove_mode mode)
 
qede_ptp_disable(edev);
 
-   qede_roce_dev_remove(edev);
+   qede_rdma_dev_remove(edev);
 
edev->ops->common->set_power_state(cdev, PCI_D0);
 
@@ -1964,7 +1964,7 @@ static void qede_unload(struct qede_dev *edev, enum 
qede_unload_mode mode,
 
edev->state = QEDE_STATE_CLOSED;
 
-   qede_roce_dev_event_close(edev);
+   qede_rdma_dev_event_close(edev);
 
/* Close OS Tx */
netif_tx_disable(edev->ndev);
@@ -2069,7 +2069,7 @@ static int qede_load(struct qede_dev *edev, enum 
qede_load_mode mode,
link_params.link_up = true;
edev->ops->common->set_link(edev->cdev, &link_params);
 
-   qede_roce_dev_event_open(edev);
+   qede_rdma_dev_event_open(edev);
 
edev->state = QEDE_STATE_OPEN;
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_rdma.c 
b/drivers/net/ethernet/qlogic/qede/qede_rdma.c
index 9837ee2..50b142f 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_rdma.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_rdma.c
@@ -40,12 +40,12 @@ static struct qedr_driver *qedr_drv;
 static LIST_HEAD(qedr_dev_list);
 static DEFINE_MUTEX(qedr_dev_list_lock);
 
-bool qede_roce_supported(struct qede_dev *dev)
+bool qede_rdma_supported(struct qede_dev *dev)
 {
return dev->dev_info.common.rdma_supported;
 }
 
-static void _qede_roce_dev_add(struct qede_dev *edev)
+static void _qede_rdma_dev_add(struct qede_dev *edev)
 {
if (!qedr_drv)
return;
@@ -54,11 +54,11 @@ static void _qede_roce_dev_add(struct qede_dev *edev)
 edev->ndev);
 }
 
-static int qede_roce_create_wq(struct qede_dev *edev)
+static int qede_rdma_c

[PATCH v4 net-next 1/7] qed: Chain support for external PBL

2017-06-20 Thread Yuval Mintz

iWARP would require the chains to allocate/free their PBL memory
independently, so add the infrastructure to provide it externally.

Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
---
 drivers/infiniband/hw/qedr/main.c |  2 +-
 drivers/infiniband/hw/qedr/verbs.c|  6 ++---
 drivers/net/ethernet/qlogic/qed/qed_dev.c | 35 ---
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h |  5 +++-
 drivers/net/ethernet/qlogic/qed/qed_iscsi.c   |  6 ++---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c |  6 ++---
 drivers/net/ethernet/qlogic/qed/qed_spq.c |  6 ++---
 drivers/net/ethernet/qlogic/qede/qede_main.c  |  8 +++---
 include/linux/qed/qed_chain.h |  7 ++
 include/linux/qed/qed_if.h|  3 ++-
 10 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/main.c 
b/drivers/infiniband/hw/qedr/main.c
index 485c1fe..5a32b80 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -276,7 +276,7 @@ static int qedr_alloc_resources(struct qedr_dev *dev)
   QED_CHAIN_CNT_TYPE_U16,
   n_entries,
   sizeof(struct regpair *),
-  >pbl);
+  >pbl, NULL);
if (rc)
goto err4;
 
diff --git a/drivers/infiniband/hw/qedr/verbs.c 
b/drivers/infiniband/hw/qedr/verbs.c
index 17685cf..80df89b 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -925,7 +925,7 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
   QED_CHAIN_CNT_TYPE_U32,
   chain_entries,
   sizeof(union rdma_cqe),
-  >pbl);
+  >pbl, NULL);
if (rc)
goto err1;
 
@@ -1413,7 +1413,7 @@ qedr_roce_create_kernel_qp(struct qedr_dev *dev,
   QED_CHAIN_CNT_TYPE_U32,
   n_sq_elems,
   QEDR_SQE_ELEMENT_SIZE,
-  >sq.pbl);
+  >sq.pbl, NULL);
 
if (rc)
return rc;
@@ -1427,7 +1427,7 @@ qedr_roce_create_kernel_qp(struct qedr_dev *dev,
   QED_CHAIN_CNT_TYPE_U32,
   n_rq_elems,
   QEDR_RQE_ELEMENT_SIZE,
-  >rq.pbl);
+  >rq.pbl, NULL);
if (rc)
return rc;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c 
b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 65fe494..8b14054 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -3075,12 +3075,15 @@ static void qed_chain_free_pbl(struct qed_dev *cdev, 
struct qed_chain *p_chain)
}
 
pbl_size = page_cnt * QED_CHAIN_PBL_ENTRY_SIZE;
-   dma_free_coherent(&cdev->pdev->dev,
- pbl_size,
- p_chain->pbl_sp.p_virt_table,
- p_chain->pbl_sp.p_phys_table);
+
+   if (!p_chain->b_external_pbl)
+   dma_free_coherent(&cdev->pdev->dev,
+ pbl_size,
+ p_chain->pbl_sp.p_virt_table,
+ p_chain->pbl_sp.p_phys_table);
 out:
vfree(p_chain->pbl.pp_virt_addr_tbl);
+   p_chain->pbl.pp_virt_addr_tbl = NULL;
 }
 
 void qed_chain_free(struct qed_dev *cdev, struct qed_chain *p_chain)
@@ -3174,7 +3177,10 @@ qed_chain_alloc_single(struct qed_dev *cdev, struct 
qed_chain *p_chain)
return 0;
 }
 
-static int qed_chain_alloc_pbl(struct qed_dev *cdev, struct qed_chain *p_chain)
+static int
+qed_chain_alloc_pbl(struct qed_dev *cdev,
+   struct qed_chain *p_chain,
+   struct qed_chain_ext_pbl *ext_pbl)
 {
u32 page_cnt = p_chain->page_cnt, size, i;
dma_addr_t p_phys = 0, p_pbl_phys = 0;
@@ -3194,8 +3200,16 @@ static int qed_chain_alloc_pbl(struct qed_dev *cdev, 
struct qed_chain *p_chain)
 * should be saved to allow its freeing during the error flow.
 */
size = page_cnt * QED_CHAIN_PBL_ENTRY_SIZE;
-   p_pbl_virt = dma_alloc_coherent(&cdev->pdev->dev,
-   size, &p_pbl_phys, GFP_KERNEL);
+
+   if (!ext_pbl) {
+   p_pbl_virt = dma_alloc_coherent(>

[PATCH v4 net-next 0/7] qed*: RDMA and infrastructure for iWARP

2017-06-20 Thread Yuval Mintz

This series focuses on RDMA in general with emphasis on required changes
toward adding iWARP support. The vast majority of the changes introduced
are in qed/qede, with a couple of small changes to qedr
[mentioned below].

The infrastructure changes:
 - Patch #1 adds the ability to pass PBL memory externally for a newly
created chain.
 - Patches #4, #5 rename qede_roce.[ch] into qede_rdma.[ch] + change
prefixes from _roce_ to _rdma_, as the API between qede and qedr is
agnostic to the variant of the RDMA protocol used. These patches also
touch qedr [basically to align it with the renaming, nothing more].
 - Patch #7 replaces the current SPQ async mechanism with one that serves
registered callbacks [before adding iWARP, which would add another client
in need of this sort of functionality].

The non-infrastructure changes:
 - Patches #2, #3 contain DCB-related changes to better align RDMA with
configured DCB.
 - Patch #6 contains a minor [mostly theoretical] fix to the release flow.

Dave,

Please consider applying this series to `net-next'.

Thanks,
Yuval

Changes from previous versions
--
 - V4: This is actually a repost of V3 due to some confusion regarding
   the sent cover-letter
 - V3: Add commit log message in #4 indicating change in header inclusion
 - V2: Add several inclusions into qede_rdma.h to have proper declarations
   of all variable types used in it

Michal Kalderon (4):
  qed*: qede_roce.[ch] -> qede_rdma.[ch]
  qed*: Set rdma generic functions prefix
  qed: Wait for resources before FUNC_CLOSE
  qed: SPQ async callback registration

Yuval Mintz (3):
  qed: Chain support for external PBL
  qed: RoCE EDPM to honor PFC
  qed: Disable RoCE dpm when DCBx change occurs

 drivers/infiniband/hw/qedr/main.c  |  10 +-
 drivers/infiniband/hw/qedr/qedr.h  |   2 +-
 drivers/infiniband/hw/qedr/verbs.c |   6 +-
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c |  24 
 drivers/net/ethernet/qlogic/qed/qed_dev.c  |  35 +++--
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h  |   5 +-
 drivers/net/ethernet/qlogic/qed/qed_iscsi.c|  30 -
 drivers/net/ethernet/qlogic/qed/qed_ll2.c  |   6 +-
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h |   6 +
 drivers/net/ethernet/qlogic/qed/qed_roce.c |  87 ++---
 drivers/net/ethernet/qlogic/qed/qed_roce.h |   9 +-
 drivers/net/ethernet/qlogic/qed/qed_sp.h   |  17 +++
 drivers/net/ethernet/qlogic/qed/qed_spq.c  |  60 +
 drivers/net/ethernet/qlogic/qed/qed_sriov.c|  16 ++-
 drivers/net/ethernet/qlogic/qed/qed_sriov.h|  18 ---
 drivers/net/ethernet/qlogic/qede/Makefile  |   2 +-
 drivers/net/ethernet/qlogic/qede/qede.h|   5 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c   |  21 ++-
 .../qlogic/qede/{qede_roce.c => qede_rdma.c}   | 144 ++---
 include/linux/qed/qed_chain.h  |   7 +
 include/linux/qed/qed_if.h |   3 +-
 include/linux/qed/{qede_roce.h => qede_rdma.h} |  42 +++---
 22 files changed, 353 insertions(+), 202 deletions(-)
 rename drivers/net/ethernet/qlogic/qede/{qede_roce.c => qede_rdma.c} (59%)
 rename include/linux/qed/{qede_roce.h => qede_rdma.h} (65%)

-- 
2.9.4

[PATCH v3 net-next 5/7] qed*: Set rdma generic functions prefix

2017-06-19 Thread Yuval Mintz

From: Michal Kalderon <michal.kalde...@cavium.com>

Rename the functions common to both iWARP and RoCE to have a prefix of
_rdma_ instead of _roce_.

Signed-off-by: Michal Kalderon <michal.kalde...@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mi...@cavium.com>
---
 drivers/infiniband/hw/qedr/main.c|   6 +-
 drivers/net/ethernet/qlogic/qede/qede.h  |   4 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c |  12 +--
 drivers/net/ethernet/qlogic/qede/qede_rdma.c | 142 +--
 include/linux/qed/qede_rdma.h|  37 +++
 5 files changed, 101 insertions(+), 100 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/main.c 
b/drivers/infiniband/hw/qedr/main.c
index 714eb0c..b5851fd 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -902,7 +902,7 @@ static void qedr_mac_address_change(struct qedr_dev *dev)
  * initialization done before RoCE driver notifies
  * event to stack.
  */
-static void qedr_notify(struct qedr_dev *dev, enum qede_roce_event event)
+static void qedr_notify(struct qedr_dev *dev, enum qede_rdma_event event)
 {
switch (event) {
case QEDE_UP:
@@ -931,12 +931,12 @@ static struct qedr_driver qedr_drv = {
 
 static int __init qedr_init_module(void)
 {
-   return qede_roce_register_driver(&qedr_drv);
+   return qede_rdma_register_driver(&qedr_drv);
 }
 
 static void __exit qedr_exit_module(void)
 {
-   qede_roce_unregister_driver(&qedr_drv);
+   qede_rdma_unregister_driver(&qedr_drv);
 }
 
 module_init(qedr_init_module);
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h 
b/drivers/net/ethernet/qlogic/qede/qede.h
index 2d6b30c..4dfb238 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -154,8 +154,8 @@ struct qede_vlan {
 struct qede_rdma_dev {
struct qedr_dev *qedr_dev;
struct list_head entry;
-   struct list_head roce_event_list;
-   struct workqueue_struct *roce_wq;
+   struct list_head rdma_event_list;
+   struct workqueue_struct *rdma_wq;
 };
 
 struct qede_ptp;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c 
b/drivers/net/ethernet/qlogic/qede/qede_main.c
index e9eaa38..06ca13d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -262,7 +262,7 @@ static int qede_netdev_event(struct notifier_block *this, 
unsigned long event,
break;
case NETDEV_CHANGEADDR:
edev = netdev_priv(ndev);
-   qede_roce_event_changeaddr(edev);
+   qede_rdma_event_changeaddr(edev);
break;
}
 
@@ -977,7 +977,7 @@ static int __qede_probe(struct pci_dev *pdev, u32 
dp_module, u8 dp_level,
 
qede_init_ndev(edev);
 
-   rc = qede_roce_dev_add(edev);
+   rc = qede_rdma_dev_add(edev);
if (rc)
goto err3;
 
@@ -1013,7 +1013,7 @@ static int __qede_probe(struct pci_dev *pdev, u32 
dp_module, u8 dp_level,
return 0;
 
 err4:
-   qede_roce_dev_remove(edev);
+   qede_rdma_dev_remove(edev);
 err3:
free_netdev(edev->ndev);
 err2:
@@ -1064,7 +1064,7 @@ static void __qede_remove(struct pci_dev *pdev, enum 
qede_remove_mode mode)
 
qede_ptp_disable(edev);
 
-   qede_roce_dev_remove(edev);
+   qede_rdma_dev_remove(edev);
 
edev->ops->common->set_power_state(cdev, PCI_D0);
 
@@ -1964,7 +1964,7 @@ static void qede_unload(struct qede_dev *edev, enum 
qede_unload_mode mode,
 
edev->state = QEDE_STATE_CLOSED;
 
-   qede_roce_dev_event_close(edev);
+   qede_rdma_dev_event_close(edev);
 
/* Close OS Tx */
netif_tx_disable(edev->ndev);
@@ -2069,7 +2069,7 @@ static int qede_load(struct qede_dev *edev, enum 
qede_load_mode mode,
link_params.link_up = true;
edev->ops->common->set_link(edev->cdev, &link_params);
 
-   qede_roce_dev_event_open(edev);
+   qede_rdma_dev_event_open(edev);
 
edev->state = QEDE_STATE_OPEN;
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_rdma.c 
b/drivers/net/ethernet/qlogic/qede/qede_rdma.c
index 9837ee2..50b142f 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_rdma.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_rdma.c
@@ -40,12 +40,12 @@ static struct qedr_driver *qedr_drv;
 static LIST_HEAD(qedr_dev_list);
 static DEFINE_MUTEX(qedr_dev_list_lock);
 
-bool qede_roce_supported(struct qede_dev *dev)
+bool qede_rdma_supported(struct qede_dev *dev)
 {
return dev->dev_info.common.rdma_supported;
 }
 
-static void _qede_roce_dev_add(struct qede_dev *edev)
+static void _qede_rdma_dev_add(struct qede_dev *edev)
 {
if (!qedr_drv)
return;
@@ -54,11 +54,11 @@ static void _qede_roce_dev_add(struct qede_dev *edev)
 edev->ndev);
 }
 
-static int qede_roce_create_wq(struct qede_dev *edev)
+static int qede_rdma_c

1 2 3 4 5 6 7 8 9 >

1 - 100 of 854 matches

Mail list logo