[PATCH bpf v3 3/5] selftests/bpf: test_sockmap, fix test timeout

2018-05-29 Thread Prashant Bhole
In order to reduce runtime of tests, recently the timeout for select() call
was reduced from 1sec to 10usec. This was causing many test failures.
It was caught with failure handling commits in this series.

Restoring the timeout from 10usec to 1sec

Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
Signed-off-by: Prashant Bhole 
---
 tools/testing/selftests/bpf/test_sockmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c 
b/tools/testing/selftests/bpf/test_sockmap.c
index 64f9e25c451f..9d01f5c2abe2 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -345,8 +345,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, 
int cnt,
if (err < 0)
perror("recv start time: ");
while (s->bytes_recvd < total_bytes) {
-   timeout.tv_sec = 0;
-   timeout.tv_usec = 10;
+   timeout.tv_sec = 1;
+   timeout.tv_usec = 0;
 
/* FD sets */
FD_ZERO();
-- 
2.17.0




[PATCH bpf v3 4/5] selftests/bpf: test_sockmap, fix data verification

2018-05-29 Thread Prashant Bhole
When data verification is enabled, some tests fail because verification is done
incorrectly. Following changes fix it.

- Identify the size of data block to be verified
- Reset verification counter when data block size is reached
- Fixed the value printed in case of verification failure

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Acked-by: John Fastabend 
Signed-off-by: Prashant Bhole 
---
 tools/testing/selftests/bpf/test_sockmap.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c 
b/tools/testing/selftests/bpf/test_sockmap.c
index 9d01f5c2abe2..664f268dc02a 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -337,8 +337,15 @@ static int msg_loop(int fd, int iov_count, int iov_length, 
int cnt,
int fd_flags = O_NONBLOCK;
struct timeval timeout;
float total_bytes;
+   int bytes_cnt = 0;
+   int chunk_sz;
fd_set w;
 
+   if (opt->sendpage)
+   chunk_sz = iov_length * cnt;
+   else
+   chunk_sz = iov_length * iov_count;
+
fcntl(fd, fd_flags);
total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
err = clock_gettime(CLOCK_MONOTONIC, >start);
@@ -388,9 +395,14 @@ static int msg_loop(int fd, int iov_count, int iov_length, 
int cnt,
errno = -EIO;
fprintf(stderr,
"detected data 
corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
-   i, j, d[j], k - 
1, d[j+1], k + 1);
+   i, j, d[j], k - 
1, d[j+1], k);
goto out_errno;
}
+   bytes_cnt++;
+   if (bytes_cnt == chunk_sz) {
+   k = 0;
+   bytes_cnt = 0;
+   }
recv--;
}
}
-- 
2.17.0




[PATCH bpf v3 5/5] selftests/bpf: test_sockmap, print additional test options

2018-05-29 Thread Prashant Bhole
Print values of test options like apply, cork, start, end so that
individual failed tests can be identified for manual run

Acked-by: John Fastabend 
Signed-off-by: Prashant Bhole 
---
 tools/testing/selftests/bpf/test_sockmap.c | 28 +++---
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c 
b/tools/testing/selftests/bpf/test_sockmap.c
index 664f268dc02a..637c6585ff80 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -869,6 +869,8 @@ static char *test_to_str(int test)
 #define OPTSTRING 60
 static void test_options(char *options)
 {
+   char tstr[OPTSTRING];
+
memset(options, 0, OPTSTRING);
 
if (txmsg_pass)
@@ -881,14 +883,22 @@ static void test_options(char *options)
strncat(options, "redir_noisy,", OPTSTRING);
if (txmsg_drop)
strncat(options, "drop,", OPTSTRING);
-   if (txmsg_apply)
-   strncat(options, "apply,", OPTSTRING);
-   if (txmsg_cork)
-   strncat(options, "cork,", OPTSTRING);
-   if (txmsg_start)
-   strncat(options, "start,", OPTSTRING);
-   if (txmsg_end)
-   strncat(options, "end,", OPTSTRING);
+   if (txmsg_apply) {
+   snprintf(tstr, OPTSTRING, "apply %d,", txmsg_apply);
+   strncat(options, tstr, OPTSTRING);
+   }
+   if (txmsg_cork) {
+   snprintf(tstr, OPTSTRING, "cork %d,", txmsg_cork);
+   strncat(options, tstr, OPTSTRING);
+   }
+   if (txmsg_start) {
+   snprintf(tstr, OPTSTRING, "start %d,", txmsg_start);
+   strncat(options, tstr, OPTSTRING);
+   }
+   if (txmsg_end) {
+   snprintf(tstr, OPTSTRING, "end %d,", txmsg_end);
+   strncat(options, tstr, OPTSTRING);
+   }
if (txmsg_ingress)
strncat(options, "ingress,", OPTSTRING);
if (txmsg_skb)
@@ -897,7 +907,7 @@ static void test_options(char *options)
 
 static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
 {
-   char *options = calloc(60, sizeof(char));
+   char *options = calloc(OPTSTRING, sizeof(char));
int err;
 
if (test == SENDPAGE)
-- 
2.17.0




[PATCH bpf v3 2/5] selftests/bpf: test_sockmap, join cgroup in selftest mode

2018-05-29 Thread Prashant Bhole
In case of selftest mode, temporary cgroup environment is created but
cgroup is not joined. It causes test failures. Fixed by joining the
cgroup

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Acked-by: John Fastabend 
Signed-off-by: Prashant Bhole 
---
 tools/testing/selftests/bpf/test_sockmap.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c 
b/tools/testing/selftests/bpf/test_sockmap.c
index 01bc9c6745e8..64f9e25c451f 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1342,6 +1342,11 @@ static int __test_suite(char *bpf_file)
return cg_fd;
}
 
+   if (join_cgroup(CG_PATH)) {
+   fprintf(stderr, "ERROR: failed to join cgroup\n");
+   return -EINVAL;
+   }
+
/* Tests basic commands and APIs with range of iov values */
txmsg_start = txmsg_end = 0;
err = test_txmsg(cg_fd);
-- 
2.17.0




[PATCH bpf v3 0/5] fix test_sockmap

2018-05-29 Thread Prashant Bhole
This series fixes error handling, timeout and data verification in
test_sockmap. Previously it was not able to detect failure/timeout in
RX/TX thread because error was not notified to the main thread.

Also slightly improved test output by printing parameter values (cork,
apply, start, end) so that parameters for all tests are displayed.

Changes in v3:
  - Skipped error checking for corked tests

Prashant Bhole (5):
  selftests/bpf: test_sockmap, check test failure
  selftests/bpf: test_sockmap, join cgroup in selftest mode
  selftests/bpf: test_sockmap, fix test timeout
  selftests/bpf: test_sockmap, fix data verification
  selftests/bpf: test_sockmap, print additional test options

 tools/testing/selftests/bpf/test_sockmap.c | 76 +-
 1 file changed, 58 insertions(+), 18 deletions(-)

-- 
2.17.0




[PATCH bpf v3 1/5] selftests/bpf: test_sockmap, check test failure

2018-05-29 Thread Prashant Bhole
Test failures are not identified because exit code of RX/TX threads
is not checked. Also threads are not returning correct exit code.

- Return exit code from threads depending on test execution status
- In main thread, check the exit code of RX/TX threads
- Skip error checking for corked tests as they are expected to timeout

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Signed-off-by: Prashant Bhole 
---
 tools/testing/selftests/bpf/test_sockmap.c | 25 --
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c 
b/tools/testing/selftests/bpf/test_sockmap.c
index eb17fae458e6..01bc9c6745e8 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -429,8 +429,8 @@ static int sendmsg_test(struct sockmap_options *opt)
struct msg_stats s = {0};
int iov_count = opt->iov_count;
int iov_buf = opt->iov_length;
+   int rx_status, tx_status;
int cnt = opt->rate;
-   int status;
 
errno = 0;
 
@@ -442,7 +442,7 @@ static int sendmsg_test(struct sockmap_options *opt)
rxpid = fork();
if (rxpid == 0) {
if (opt->drop_expected)
-   exit(1);
+   exit(0);
 
if (opt->sendpage)
iov_count = 1;
@@ -463,7 +463,7 @@ static int sendmsg_test(struct sockmap_options *opt)
"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB 
%fB/s %fGB/s\n",
s.bytes_sent, sent_Bps, sent_Bps/giga,
s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-   exit(1);
+   exit(err ? 1 : 0);
} else if (rxpid == -1) {
perror("msg_loop_rx: ");
return errno;
@@ -491,14 +491,27 @@ static int sendmsg_test(struct sockmap_options *opt)
"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB 
%fB/s %fGB/s\n",
s.bytes_sent, sent_Bps, sent_Bps/giga,
s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-   exit(1);
+   exit(err ? 1 : 0);
} else if (txpid == -1) {
perror("msg_loop_tx: ");
return errno;
}
 
-   assert(waitpid(rxpid, , 0) == rxpid);
-   assert(waitpid(txpid, , 0) == txpid);
+   assert(waitpid(rxpid, _status, 0) == rxpid);
+   assert(waitpid(txpid, _status, 0) == txpid);
+   if (WIFEXITED(rx_status)) {
+   err = WEXITSTATUS(rx_status);
+   if (err && !txmsg_cork) {
+   fprintf(stderr, "rx thread exited with err %d. ", err);
+   goto out;
+   }
+   }
+   if (WIFEXITED(tx_status)) {
+   err = WEXITSTATUS(tx_status);
+   if (err)
+   fprintf(stderr, "tx thread exited with err %d. ", err);
+   }
+out:
return err;
 }
 
-- 
2.17.0




Re: [PATCH net-next 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.

2018-05-29 Thread Jakub Kicinski
On Tue, 29 May 2018 20:19:54 -0700, Michael Chan wrote:
> On Tue, May 29, 2018 at 1:46 PM, Samudrala, Sridhar wrote:
> > Isn't ndo_set_vf_xxx() considered a legacy interface and not planned to be
> > extended?

+1 it's painful to see this feature being added to the legacy
API :(  Another duplicated configuration knob.

> I didn't know about that.
>
> > Shouldn't we enable this via ethtool on the port representor netdev?
>
> We discussed about this.  ethtool on the VF representor will only work
> in switchdev mode and also will not support min/max values.

Ethtool channel API may be overdue a rewrite in devlink anyway, but I
feel like implementing switchdev mode and rewriting features in devlink
may be too much to ask.


[PATCH v5 net] stmmac: 802.1ad tag stripping fix

2018-05-29 Thread Elad Nachman
stmmac reception handler calls stmmac_rx_vlan() to strip the vlan before 
calling napi_gro_receive().

The function assumes VLAN tagged frames are always tagged with 802.1Q protocol,
and assigns ETH_P_8021Q to the skb by hard-coding the parameter on call to 
__vlan_hwaccel_put_tag() .

This causes packets not to be passed to the VLAN slave if it was created with 
802.1AD protocol
(ip link add link eth0 eth0.100 type vlan proto 802.1ad id 100).

This fix passes the protocol from the VLAN header into __vlan_hwaccel_put_tag()
instead of using the hard-coded value of ETH_P_8021Q.
NETIF_F_HW_VLAN_CTAG_RX check was removed and NETIF_F_HW_VLAN_STAG_RX feature 
was added to be in line with the driver actual abilities.

Signed-off-by: Elad Nachman 

---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index b65e2d1..f680bcf 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3293,17 +3293,17 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, 
struct net_device *dev)
 
 static void stmmac_rx_vlan(struct net_device *dev, struct sk_buff *skb)
 {
-   struct ethhdr *ehdr;
+   struct vlan_ethhdr *veth;
u16 vlanid;
+   __be16 vlan_proto;
 
-   if ((dev->features & NETIF_F_HW_VLAN_CTAG_RX) ==
-   NETIF_F_HW_VLAN_CTAG_RX &&
-   !__vlan_get_tag(skb, )) {
+   if (!__vlan_get_tag(skb, )) {
/* pop the vlan tag */
-   ehdr = (struct ethhdr *)skb->data;
-   memmove(skb->data + VLAN_HLEN, ehdr, ETH_ALEN * 2);
+   veth = (struct vlan_ethhdr *)skb->data;
+   vlan_proto = veth->h_vlan_proto;
+   memmove(skb->data + VLAN_HLEN, veth, ETH_ALEN * 2);
skb_pull(skb, VLAN_HLEN);
-   __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid);
+   __vlan_hwaccel_put_tag(skb, vlan_proto, vlanid);
}
 }
 
@@ -4344,7 +4344,7 @@ int stmmac_dvr_probe(struct device *device,
ndev->watchdog_timeo = msecs_to_jiffies(watchdog);
 #ifdef STMMAC_VLAN_TAG_USED
/* Both mac100 and gmac support receive VLAN tag detection */
-   ndev->features |= NETIF_F_HW_VLAN_CTAG_RX;
+   ndev->features |= NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX;
 #endif
priv->msg_enable = netif_msg_init(debug, default_msg_level);
 
-- 
2.7.4


Re: [PATCH bpf v2 0/5] fix test_sockmap

2018-05-29 Thread Prashant Bhole




On 5/30/2018 2:12 PM, John Fastabend wrote:

On 05/29/2018 05:44 PM, Prashant Bhole wrote:



On 5/30/2018 12:48 AM, John Fastabend wrote:

On 05/27/2018 09:37 PM, Prashant Bhole wrote:

This series fixes error handling, timeout and data verification in
test_sockmap. Previously it was not able to detect failure/timeout in
RX/TX thread because error was not notified to the main thread.

Also slightly improved test output by printing parameter values (cork,
apply, start, end) so that parameters for all tests are displayed.

Prashant Bhole (5):
    selftests/bpf: test_sockmap, check test failure
    selftests/bpf: test_sockmap, join cgroup in selftest mode
    selftests/bpf: test_sockmap, fix test timeout
    selftests/bpf: test_sockmap, fix data verification
    selftests/bpf: test_sockmap, print additional test options

   tools/testing/selftests/bpf/test_sockmap.c | 76 +-
   1 file changed, 58 insertions(+), 18 deletions(-)



After first patch "check test failure" how do we handle the case
where test is known to cause timeouts because we are specifically testing
these cases. This is the 'cork' parameter we discussed in the last
series. It looks like with this series the test may still throw an
error?


Sorry. In your comment in last series, did you mean to skip error
checking only for all cork tests (for now)?

-Prashant



Hi, After this is applied are any errors returned from test_sockmap?
When I read the first patch it looked like timeouts from the cork
tests may result in errors "FAILED" tests. If this is the case then
yes we need skip error checking on all tests or just the corked
tests.


Yes errors returned after applying this series. I will skip error 
checking on just corked tests.


-Prashant



Re: [PATCH bpf v2 0/5] fix test_sockmap

2018-05-29 Thread John Fastabend
On 05/29/2018 05:44 PM, Prashant Bhole wrote:
> 
> 
> On 5/30/2018 12:48 AM, John Fastabend wrote:
>> On 05/27/2018 09:37 PM, Prashant Bhole wrote:
>>> This series fixes error handling, timeout and data verification in
>>> test_sockmap. Previously it was not able to detect failure/timeout in
>>> RX/TX thread because error was not notified to the main thread.
>>>
>>> Also slightly improved test output by printing parameter values (cork,
>>> apply, start, end) so that parameters for all tests are displayed.
>>>
>>> Prashant Bhole (5):
>>>    selftests/bpf: test_sockmap, check test failure
>>>    selftests/bpf: test_sockmap, join cgroup in selftest mode
>>>    selftests/bpf: test_sockmap, fix test timeout
>>>    selftests/bpf: test_sockmap, fix data verification
>>>    selftests/bpf: test_sockmap, print additional test options
>>>
>>>   tools/testing/selftests/bpf/test_sockmap.c | 76 +-
>>>   1 file changed, 58 insertions(+), 18 deletions(-)
>>>
>>
>> After first patch "check test failure" how do we handle the case
>> where test is known to cause timeouts because we are specifically testing
>> these cases. This is the 'cork' parameter we discussed in the last
>> series. It looks like with this series the test may still throw an
>> error?
> 
> Sorry. In your comment in last series, did you mean to skip error
> checking only for all cork tests (for now)?
> 
> -Prashant
> 

Hi, After this is applied are any errors returned from test_sockmap?
When I read the first patch it looked like timeouts from the cork
tests may result in errors "FAILED" tests. If this is the case then
yes we need skip error checking on all tests or just the corked
tests.

.John


[PATCH net] mlx4_core: restore optimal ICM memory allocation

2018-05-29 Thread Eric Dumazet
Commit 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
brought a regression caught in our regression suite, thanks to KASAN.

Note that mlx4_alloc_icm() is already able to try high order allocations
and fallback to low-order allocations under high memory pressure.

We only have to tweak gfp_mask a bit, to help falling back faster,
without risking OOM killings.

BUG: KASAN: slab-out-of-bounds in to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
Read of size 4 at addr 8817df584f68 by task qp_listing_test/92585

CPU: 38 PID: 92585 Comm: qp_listing_test Tainted: G   O
Call Trace:
 [] dump_stack+0x4d/0x72
 [] print_address_description+0x6f/0x260
 [] kasan_report+0x257/0x370
 [] __asan_report_load4_noabort+0x19/0x20
 [] to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
 [] mlx4_ib_query_qp+0x1213/0x1660 [mlx4_ib]
 [] qpstat_print_qp+0x13b/0x500 [ib_uverbs]
 [] qpstat_seq_show+0x4a/0xb0 [ib_uverbs]
 [] seq_read+0xa9c/0x1230
 [] proc_reg_read+0xc1/0x180
 [] __vfs_read+0xe8/0x730
 [] vfs_read+0xf7/0x300
 [] SyS_read+0xd2/0x1b0
 [] do_syscall_64+0x186/0x420
 [] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x7f851a7bb30d
RSP: 002b:7ffd09a758c0 EFLAGS: 0293 ORIG_RAX: 
RAX: ffda RBX: 7f84ff959440 RCX: 7f851a7bb30d
RDX: 0003fc00 RSI: 7f84ff60a000 RDI: 000b
RBP: 7ffd09a75900 R08:  R09: 
R10: 0022 R11: 0293 R12: 
R13: 0003 R14: 0003 R15: 7f84ff60a000

Allocated by task 4488:
 save_stack+0x46/0xd0
 kasan_kmalloc+0xad/0xe0
 __kmalloc+0x101/0x5e0
 ib_register_device+0xc03/0x1250 [ib_core]
 mlx4_ib_add+0x27d6/0x4dd0 [mlx4_ib]
 mlx4_add_device+0xa9/0x340 [mlx4_core]
 mlx4_register_interface+0x16e/0x390 [mlx4_core]
 xhci_pci_remove+0x7a/0x180 [xhci_pci]
 do_one_initcall+0xa0/0x230
 do_init_module+0x1b9/0x5a4
 load_module+0x63e6/0x94c0
 SYSC_init_module+0x1a4/0x1c0
 SyS_init_module+0xe/0x10
 do_syscall_64+0x186/0x420
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Freed by task 0:
(stack is not available)

The buggy address belongs to the object at 8817df584f40
 which belongs to the cache kmalloc-32 of size 32
The buggy address is located 8 bytes to the right of
 32-byte region [8817df584f40, 8817df584f60)
The buggy address belongs to the page:
page:ea005f7d6100 count:1 mapcount:0 mapping:8817df584000 
index:0x8817df584fc1
flags: 0x8800100(slab)
raw: 08800100 8817df584000 8817df584fc1 0001003f
raw: ea005f3ac0a0 ea005c476760 8817fec00900 883ff78d26c0
page dumped because: kasan: bad access detected
page->mem_cgroup:883ff78d26c0

Memory state around the buggy address:
 8817df584e00: 00 03 fc fc fc fc fc fc 00 03 fc fc fc fc fc fc
 8817df584e80: 00 00 00 04 fc fc fc fc 00 00 00 fc fc fc fc fc
> 8817df584f00: fb fb fb fb fc fc fc fc 00 00 00 00 fc fc fc fc
  ^
 8817df584f80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
 8817df585000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

Fixes: 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
Signed-off-by: Eric Dumazet 
Cc: John Sperbeck 
Cc: Tarick Bedeir 
Cc: Qing Huang 
Cc: Daniel Jurgens 
Cc: Zhu Yanjun 
Cc: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx4/icm.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c 
b/drivers/net/ethernet/mellanox/mlx4/icm.c
index 
685337d58276fc91baeeb64387c52985e1bc6dda..cae33d5c7dbd9ba7929adcf2127b104f6796fa5a
 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,13 @@
 #include "fw.h"
 
 /*
- * We allocate in page size (default 4KB on many archs) chunks to avoid high
- * order memory allocations in fragmented/high usage memory situation.
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk. Note that the chunks are not necessarily in contiguous
+ * physical memory.
  */
 enum {
-   MLX4_ICM_ALLOC_SIZE = PAGE_SIZE,
-   MLX4_TABLE_CHUNK_SIZE   = PAGE_SIZE,
+   MLX4_ICM_ALLOC_SIZE = 1 << 18,
+   MLX4_TABLE_CHUNK_SIZE   = 1 << 18,
 };
 
 static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk 
*chunk)
@@ -135,6 +136,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int 
npages,
struct mlx4_icm *icm;
struct mlx4_icm_chunk *chunk = NULL;
int cur_order;
+   gfp_t mask;
int ret;
 
/* We use sg_set_buf for coherent allocs, which assumes low memory */
@@ -178,13 +180,16 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int 
npages,
while (1 << cur_order > npages)
--cur_order;
 
+   mask = gfp_mask;
+   if (cur_order)
+   mask = (mask & 

Re: [PATCH V4] mlx4_core: allocate ICM memory in page size chunks

2018-05-29 Thread Eric Dumazet



On 05/29/2018 11:44 PM, Eric Dumazet wrote:

> 
> And I will add this simple fix, this really should address your initial 
> concern much better.
> 
> @@ -99,6 +100,8 @@ static int mlx4_alloc_icm_pages(struct scatterlist *mem, 
> int order,
>  {
> struct page *page;
>  
> +   if (order)
> +   gfp_mask |= __GFP_NORETRY;

and also  gfp_mask &= ~__GFP_DIRECT_RECLAIM


> page = alloc_pages_node(node, gfp_mask, order);
> if (!page) {
> page = alloc_pages(gfp_mask, order);
> 



Re: [PATCH V4] mlx4_core: allocate ICM memory in page size chunks

2018-05-29 Thread Eric Dumazet



On 05/29/2018 11:34 PM, Eric Dumazet wrote:

> I will test :
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c 
> b/drivers/net/ethernet/mellanox/mlx4/icm.c
> index 
> 685337d58276fc91baeeb64387c52985e1bc6dda..4d2a71381acb739585d662175e86caef72338097
>  100644
> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
> @@ -43,12 +43,13 @@
>  #include "fw.h"
>  
>  /*
> - * We allocate in page size (default 4KB on many archs) chunks to avoid high
> - * order memory allocations in fragmented/high usage memory situation.
> + * We allocate in as big chunks as we can, up to a maximum of 256 KB
> + * per chunk. Note that the chunks are not necessarily in contiguous
> + * physical memory.
>   */
>  enum {
> -   MLX4_ICM_ALLOC_SIZE = PAGE_SIZE,
> -   MLX4_TABLE_CHUNK_SIZE   = PAGE_SIZE,
> +   MLX4_ICM_ALLOC_SIZE = 1 << 18,
> +   MLX4_TABLE_CHUNK_SIZE   = 1 << 18
>  };
>  
>  static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk 
> *chunk)
> 

And I will add this simple fix, this really should address your initial concern 
much better.

@@ -99,6 +100,8 @@ static int mlx4_alloc_icm_pages(struct scatterlist *mem, int 
order,
 {
struct page *page;
 
+   if (order)
+   gfp_mask |= __GFP_NORETRY;
page = alloc_pages_node(node, gfp_mask, order);
if (!page) {
page = alloc_pages(gfp_mask, order);


Re: [net] vhost: Use kzalloc() to allocate vhost_msg_node

2018-05-29 Thread Guenter Roeck

On 05/29/2018 08:01 PM, Michael S. Tsirkin wrote:

On Tue, May 29, 2018 at 03:19:08PM -0700, Guenter Roeck wrote:

On Fri, Apr 27, 2018 at 11:45:02AM -0400, Kevin Easton wrote:

The struct vhost_msg within struct vhost_msg_node is copied to userspace,
so it should be allocated with kzalloc() to ensure all structure padding
is zeroed.

Signed-off-by: Kevin Easton 
Reported-by: syzbot+87cfa083e727a2247...@syzkaller.appspotmail.com


Is this patch going anywhere ?

The patch fixes CVE-2018-1118. It would be useful to understand if and when
this problem is going to be fixed.

Thanks,
Guenter

---
  drivers/vhost/vhost.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f3bd8e9..1b84dcff 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2339,7 +2339,7 @@ EXPORT_SYMBOL_GPL(vhost_disable_notify);
  /* Create a new message. */
  struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
  {
-   struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
+   struct vhost_msg_node *node = kzalloc(sizeof *node, GFP_KERNEL);
if (!node)
return NULL;
node->vq = vq;


As I pointed out, we don't need to init the whole structure. The proper
fix is thus (I think) below.

Could you use your testing infrastructure to confirm this fixes the issue?



Sorry, I don't have the means to test the fix.

Guenter


Thanks!

Signed-off-by: Michael S. Tsirkin 

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f3bd8e941224..58d9aec90afb 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2342,6 +2342,9 @@ struct vhost_msg_node *vhost_new_msg(struct 
vhost_virtqueue *vq, int type)
struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
if (!node)
return NULL;
+
+   /* Make sure all padding within the structure is initialized. */
+   memset(>msg, 0, sizeof node->msg);
node->vq = vq;
node->msg.type = type;
return node;





Re: [PATCH V4] mlx4_core: allocate ICM memory in page size chunks

2018-05-29 Thread Eric Dumazet



On 05/25/2018 10:23 AM, David Miller wrote:
> From: Qing Huang 
> Date: Wed, 23 May 2018 16:22:46 -0700
> 
>> When a system is under memory pressure (high usage with fragments),
>> the original 256KB ICM chunk allocations will likely trigger kernel
>> memory management to enter slow path doing memory compact/migration
>> ops in order to complete high order memory allocations.
>>
>> When that happens, user processes calling uverb APIs may get stuck
>> for more than 120s easily even though there are a lot of free pages
>> in smaller chunks available in the system.
>>
>> Syslog:
>> ...
>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>> oracle_205573_e:205573 blocked for more than 120 seconds.
>> ...
>>
>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>
>> However in order to support smaller ICM chunk size, we need to fix
>> another issue in large size kcalloc allocations.
>>
>> E.g.
>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
>> entry). So we need a 16MB allocation for a table->icm pointer array to
>> hold 2M pointers which can easily cause kcalloc to fail.
>>
>> The solution is to use kvzalloc to replace kcalloc which will fall back
>> to vmalloc automatically if kmalloc fails.
>>
>> Signed-off-by: Qing Huang 
>> Acked-by: Daniel Jurgens 
>> Reviewed-by: Zhu Yanjun 
> 
> Applied, thanks.
> 

I must say this patch causes regressions here.

KASAN is not happy.

It looks that you guys did not really looked at mlx4_alloc_icm()

This function is properly handling high order allocations with fallbacks to 
order-0 pages
under high memory pressure.

BUG: KASAN: slab-out-of-bounds in to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
Read of size 4 at addr 8817df584f68 by task qp_listing_test/92585

CPU: 38 PID: 92585 Comm: qp_listing_test Tainted: G   O 
Call Trace:
 [] dump_stack+0x4d/0x72
 [] print_address_description+0x6f/0x260
 [] kasan_report+0x257/0x370
 [] __asan_report_load4_noabort+0x19/0x20
 [] to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
 [] mlx4_ib_query_qp+0x1213/0x1660 [mlx4_ib]
 [] qpstat_print_qp+0x13b/0x500 [ib_uverbs]
 [] qpstat_seq_show+0x4a/0xb0 [ib_uverbs]
 [] seq_read+0xa9c/0x1230
 [] proc_reg_read+0xc1/0x180
 [] __vfs_read+0xe8/0x730
 [] vfs_read+0xf7/0x300
 [] SyS_read+0xd2/0x1b0
 [] do_syscall_64+0x186/0x420
 [] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x7f851a7bb30d
RSP: 002b:7ffd09a758c0 EFLAGS: 0293 ORIG_RAX: 
RAX: ffda RBX: 7f84ff959440 RCX: 7f851a7bb30d
RDX: 0003fc00 RSI: 7f84ff60a000 RDI: 000b
RBP: 7ffd09a75900 R08:  R09: 
R10: 0022 R11: 0293 R12: 
R13: 0003 R14: 0003 R15: 7f84ff60a000

Allocated by task 4488: 
 save_stack+0x46/0xd0
 kasan_kmalloc+0xad/0xe0
 __kmalloc+0x101/0x5e0
 ib_register_device+0xc03/0x1250 [ib_core]
 mlx4_ib_add+0x27d6/0x4dd0 [mlx4_ib]
 mlx4_add_device+0xa9/0x340 [mlx4_core]
 mlx4_register_interface+0x16e/0x390 [mlx4_core]
 xhci_pci_remove+0x7a/0x180 [xhci_pci]
 do_one_initcall+0xa0/0x230
 do_init_module+0x1b9/0x5a4
 load_module+0x63e6/0x94c0
 SYSC_init_module+0x1a4/0x1c0
 SyS_init_module+0xe/0x10
 do_syscall_64+0x186/0x420
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Freed by task 0:
(stack is not available)

The buggy address belongs to the object at 8817df584f40
 which belongs to the cache kmalloc-32 of size 32
The buggy address is located 8 bytes to the right of
 32-byte region [8817df584f40, 8817df584f60)
The buggy address belongs to the page:
page:ea005f7d6100 count:1 mapcount:0 mapping:8817df584000 
index:0x8817df584fc1
flags: 0x8800100(slab)
raw: 08800100 8817df584000 8817df584fc1 0001003f
raw: ea005f3ac0a0 ea005c476760 8817fec00900 883ff78d26c0
page dumped because: kasan: bad access detected
page->mem_cgroup:883ff78d26c0

Memory state around the buggy address:
 8817df584e00: 00 03 fc fc fc fc fc fc 00 03 fc fc fc fc fc fc
 8817df584e80: 00 00 00 04 fc fc fc fc 00 00 00 fc fc fc fc fc
>8817df584f00: fb fb fb fb fc fc fc fc 00 00 00 00 fc fc fc fc
  ^
 8817df584f80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
 8817df585000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

I will test :

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c 
b/drivers/net/ethernet/mellanox/mlx4/icm.c
index 
685337d58276fc91baeeb64387c52985e1bc6dda..4d2a71381acb739585d662175e86caef72338097
 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,13 @@
 #include "fw.h"
 
 /*
- * We allocate in page size (default 4KB on many archs) chunks to avoid high
- * order memory allocations in fragmented/high usage memory situation.
+ * We allocate in as big 

Re: [PATCH net-next 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.

2018-05-29 Thread Michael Chan
On Tue, May 29, 2018 at 1:46 PM, Samudrala, Sridhar
 wrote:

>
> Isn't ndo_set_vf_xxx() considered a legacy interface and not planned to be
> extended?

I didn't know about that.

> Shouldn't we enable this via ethtool on the port representor netdev?
>
>

We discussed about this.  ethtool on the VF representor will only work
in switchdev mode and also will not support min/max values.


Re: [PATCH mlx5-next 2/2] net/mlx5: Add FPGA QP error event

2018-05-29 Thread Andrew Lunn
On Tue, May 29, 2018 at 05:19:54PM -0700, Saeed Mahameed wrote:
> From: Ilan Tayari 
> 
> The FPGA QP event fires whenever a QP on the FPGA transitions
> to the error state.

FPGA i know, field programmable gate array. Could you offer some clue
as to what QP means?

   Thanks
Andrew


Re: [PATCH mlx5-next 1/2] net/mlx5: Add temperature warning event to log

2018-05-29 Thread Andrew Lunn
On Tue, May 29, 2018 at 05:19:53PM -0700, Saeed Mahameed wrote:
> From: Ilan Tayari 
> 
> Temperature warning event is sent by FW to indicate high temperature
> as detected by one of the sensors on the board.
> Add handling of this event by writing the numbers of the alert sensors
> to the kernel log.

Hi Saaed

Is the temperature itself available? If so, it would be better to
expose this as a hwmon device per temperature sensor.

   Andrew


Re: [PATCH bpf v2 0/5] fix test_sockmap

2018-05-29 Thread Prashant Bhole




On 5/30/2018 12:48 AM, John Fastabend wrote:

On 05/27/2018 09:37 PM, Prashant Bhole wrote:

This series fixes error handling, timeout and data verification in
test_sockmap. Previously it was not able to detect failure/timeout in
RX/TX thread because error was not notified to the main thread.

Also slightly improved test output by printing parameter values (cork,
apply, start, end) so that parameters for all tests are displayed.

Prashant Bhole (5):
   selftests/bpf: test_sockmap, check test failure
   selftests/bpf: test_sockmap, join cgroup in selftest mode
   selftests/bpf: test_sockmap, fix test timeout
   selftests/bpf: test_sockmap, fix data verification
   selftests/bpf: test_sockmap, print additional test options

  tools/testing/selftests/bpf/test_sockmap.c | 76 +-
  1 file changed, 58 insertions(+), 18 deletions(-)



After first patch "check test failure" how do we handle the case
where test is known to cause timeouts because we are specifically testing
these cases. This is the 'cork' parameter we discussed in the last
series. It looks like with this series the test may still throw an
error?


Sorry. In your comment in last series, did you mean to skip error 
checking only for all cork tests (for now)?


-Prashant



[net-next 6/7] net/mlx5: FPGA, Call DMA unmap with the right size

2018-05-29 Thread Saeed Mahameed
From: Ilya Lesokhin 

When mlx5_fpga_conn_unmap_buf is called buf->sg[0].size
should equal the actual buffer size, not the message size.
Otherwise we will trigger the following dma debug warning
"DMA-API: device driver frees DMA memory with different size"

Fixes: 537a50574175 ('net/mlx5: FPGA, Add high-speed connection routines')
Signed-off-by: Ilya Lesokhin 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index bf84678b21d6..4138a770ed57 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -256,8 +256,6 @@ static void mlx5_fpga_conn_rq_cqe(struct mlx5_fpga_conn 
*conn,
ix = be16_to_cpu(cqe->wqe_counter) & (conn->qp.rq.size - 1);
buf = conn->qp.rq.bufs[ix];
conn->qp.rq.bufs[ix] = NULL;
-   if (!status)
-   buf->sg[0].size = be32_to_cpu(cqe->byte_cnt);
conn->qp.rq.cc++;
 
if (unlikely(status && (status != MLX5_CQE_SYNDROME_WR_FLUSH_ERR)))
@@ -275,6 +273,7 @@ static void mlx5_fpga_conn_rq_cqe(struct mlx5_fpga_conn 
*conn,
return;
}
 
+   buf->sg[0].size = be32_to_cpu(cqe->byte_cnt);
mlx5_fpga_dbg(conn->fdev, "Message with %u bytes received 
successfully\n",
  buf->sg[0].size);
conn->recv_cb(conn->cb_arg, buf);
-- 
2.17.0



[net-next 5/7] net/mlx5: FPGA, Properly initialize dma direction on fpga conn send

2018-05-29 Thread Saeed Mahameed
From: Ilya Lesokhin 

Properly initialize dma direction on fpga conn send.
Do not rely on dma_dir == 0 (DMA_BIDIRECTIONAL).

Signed-off-by: Ilya Lesokhin 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index 4e5a5cf25f17..bf84678b21d6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -181,6 +181,7 @@ int mlx5_fpga_conn_send(struct mlx5_fpga_conn *conn,
if (!conn->qp.active)
return -ENOTCONN;
 
+   buf->dma_dir = DMA_TO_DEVICE;
err = mlx5_fpga_conn_map_buf(conn, buf);
if (err)
return err;
-- 
2.17.0



[net-next 7/7] net/mlx5e: Get the number of offloaded TC rules from the correct table

2018-05-29 Thread Saeed Mahameed
From: Or Gerlitz 

As we keep the offloaded TC rules for NIC and e-switch in two different
places, make sure to return the number of offloaded flows according
to the use-case and not blindly from the priv.

Fixes: 655dc3d2b91b ('net/mlx5e: Use shared table for offloaded TC eswitch 
flows')
Signed-off-by: Or Gerlitz 
Reported-by: Paul Blakey 
Reviewed-by: Paul Blakey 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 7 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 5 +
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 9372d914abe5..0edf4751a8ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2876,3 +2876,10 @@ void mlx5e_tc_esw_cleanup(struct rhashtable *tc_ht)
 {
rhashtable_free_and_destroy(tc_ht, _mlx5e_tc_del_flow, NULL);
 }
+
+int mlx5e_tc_num_filters(struct mlx5e_priv *priv)
+{
+   struct rhashtable *tc_ht = get_tc_ht(priv);
+
+   return atomic_read(_ht->nelems);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
index 59e52b845beb..49436bf3b80a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
@@ -68,10 +68,7 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
 struct mlx5e_neigh_hash_entry;
 void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe);
 
-static inline int mlx5e_tc_num_filters(struct mlx5e_priv *priv)
-{
-   return atomic_read(>fs.tc.ht.nelems);
-}
+int mlx5e_tc_num_filters(struct mlx5e_priv *priv);
 
 #else /* CONFIG_MLX5_ESWITCH */
 static inline int  mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; }
-- 
2.17.0



[net-next 3/7] net/mlx5: FPGA, print SBU identification on init

2018-05-29 Thread Saeed Mahameed
From: Ilan Tayari 

Add print of the following values on init:
1. ieee vendor id
2. sandbox product id
3. sandbox product version

Signed-off-by: Ilan Tayari 
Signed-off-by: Adi Nissim 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index 8531098a7f19..02319f779a49 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -160,11 +160,14 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
goto out;
 
fpga_device_id = MLX5_CAP_FPGA(fdev->mdev, fpga_device);
-   mlx5_fpga_info(fdev, "%s:%u; %s image, version %u\n",
+   mlx5_fpga_info(fdev, "%s:%u; %s image, version %u; SBU %06x:%04x 
version %d\n",
   mlx5_fpga_device_name(fpga_device_id),
   fpga_device_id,
   mlx5_fpga_image_name(fdev->last_oper_image),
-  MLX5_CAP_FPGA(fdev->mdev, image_version));
+  MLX5_CAP_FPGA(fdev->mdev, image_version),
+  MLX5_CAP_FPGA(fdev->mdev, ieee_vendor_id),
+  MLX5_CAP_FPGA(fdev->mdev, sandbox_product_id),
+  MLX5_CAP_FPGA(fdev->mdev, sandbox_product_version));
 
max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps);
err = mlx5_core_reserve_gids(mdev, max_num_qps);
-- 
2.17.0



[net-next 2/7] net/mlx5: FPGA, Add device name

2018-05-29 Thread Saeed Mahameed
From: Ilan Tayari 

Add device name for Mellanox FPGA devices.

Signed-off-by: Ilan Tayari 
Signed-off-by: Adi Nissim 
Signed-off-by: Saeed Mahameed 
---
 .../ethernet/mellanox/mlx5/core/fpga/cmd.h|  7 ++
 .../ethernet/mellanox/mlx5/core/fpga/core.c   | 24 ---
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h 
b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
index d05233c9b4f6..eb8b0fe0b4e1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
@@ -35,6 +35,13 @@
 
 #include 
 
+enum mlx5_fpga_device_id {
+   MLX5_FPGA_DEVICE_UNKNOWN = 0,
+   MLX5_FPGA_DEVICE_KU040 = 1,
+   MLX5_FPGA_DEVICE_KU060 = 2,
+   MLX5_FPGA_DEVICE_KU060_2 = 3,
+};
+
 enum mlx5_fpga_image {
MLX5_FPGA_IMAGE_USER = 0,
MLX5_FPGA_IMAGE_FACTORY,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index dc8970346521..8531098a7f19 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -75,6 +75,21 @@ static const char *mlx5_fpga_image_name(enum mlx5_fpga_image 
image)
}
 }
 
+static const char *mlx5_fpga_device_name(u32 device)
+{
+   switch (device) {
+   case MLX5_FPGA_DEVICE_KU040:
+   return "ku040";
+   case MLX5_FPGA_DEVICE_KU060:
+   return "ku060";
+   case MLX5_FPGA_DEVICE_KU060_2:
+   return "ku060_2";
+   case MLX5_FPGA_DEVICE_UNKNOWN:
+   default:
+   return "unknown";
+   }
+}
+
 static int mlx5_fpga_device_load_check(struct mlx5_fpga_device *fdev)
 {
struct mlx5_fpga_query query;
@@ -128,8 +143,9 @@ static int mlx5_fpga_device_brb(struct mlx5_fpga_device 
*fdev)
 int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
 {
struct mlx5_fpga_device *fdev = mdev->fpga;
-   unsigned long flags;
unsigned int max_num_qps;
+   unsigned long flags;
+   u32 fpga_device_id;
int err;
 
if (!fdev)
@@ -143,8 +159,10 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
if (err)
goto out;
 
-   mlx5_fpga_info(fdev, "device %u; %s image, version %u\n",
-  MLX5_CAP_FPGA(fdev->mdev, fpga_device),
+   fpga_device_id = MLX5_CAP_FPGA(fdev->mdev, fpga_device);
+   mlx5_fpga_info(fdev, "%s:%u; %s image, version %u\n",
+  mlx5_fpga_device_name(fpga_device_id),
+  fpga_device_id,
   mlx5_fpga_image_name(fdev->last_oper_image),
   MLX5_CAP_FPGA(fdev->mdev, image_version));
 
-- 
2.17.0



[net-next 4/7] net/mlx5: FPGA, Abort FPGA init if the device reports no QP capability

2018-05-29 Thread Saeed Mahameed
From: Yevgeny Kliteynik 

In the case that the reported max number of QPs capability
equals zero, abort FPGA init.

Signed-off-by: Yevgeny Kliteynik 
Signed-off-by: Adi Nissim 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index 02319f779a49..26caa0475985 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -170,6 +170,12 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
   MLX5_CAP_FPGA(fdev->mdev, sandbox_product_version));
 
max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps);
+   if (!max_num_qps) {
+   mlx5_fpga_err(fdev, "FPGA reports 0 QPs in SHELL_CAPS\n");
+   err = -ENOTSUPP;
+   goto out;
+   }
+
err = mlx5_core_reserve_gids(mdev, max_num_qps);
if (err)
goto out;
-- 
2.17.0



[net-next 1/7] net/mlx5: FPGA, Add doxygen for access type enum

2018-05-29 Thread Saeed Mahameed
From: Ilan Tayari 

Add doxygen comments for enum mlx5_fpga_access_type.

Signed-off-by: Ilan Tayari 
Signed-off-by: Adi Nissim 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h 
b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
index a0573cc2fc9b..656f96be6e20 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
@@ -44,8 +44,14 @@
 #define SBU_QP_QUEUE_SIZE 8
 #define MLX5_FPGA_CMD_TIMEOUT_MSEC (60 * 1000)
 
+/**
+ * enum mlx5_fpga_access_type - Enumerated the different methods possible for
+ * accessing the device memory address space
+ */
 enum mlx5_fpga_access_type {
+   /** Use the slow CX-FPGA I2C bus */
MLX5_FPGA_ACCESS_TYPE_I2C = 0x0,
+   /** Use the fastest available method */
MLX5_FPGA_ACCESS_TYPE_DONTCARE = 0x0,
 };
 
-- 
2.17.0



[pull request][net-next 0/7] Mellanox, mlx5e & FPGA updates 2018-05-29

2018-05-29 Thread Saeed Mahameed
Hi Dave,

The following series includes some minor FPGA and mlx5e netdev updates,
for more information please see tag log below.

Please pull and let me know if there's any problem.

Note: This series doesn't include nor require any mlx5-next shared code
and can be applied as is to net-next tree.

Thanks,
Saeed.

---

The following changes since commit ae40832e53c33fab2755571dabc1378117bc50d4:

  bpfilter: fix a build err (2018-05-29 15:20:21 -0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5e-updates-2018-05-29

for you to fetch changes up to 01252a27837a9dd099a6e8cfa3adc4772033a5bf:

  net/mlx5e: Get the number of offloaded TC rules from the correct table 
(2018-05-29 17:27:50 -0700)


mlx5e-updates-2018-05-29

This series includes mlx5 FPGA and mlx5e netdevice updates:

1) Print FPGA info such as device name, vendor id, etc.., from Ilan Tayari.
2) Abort FPGA if some essential capabilities are not supported, from Yevgeny 
Kliteynik.
3) Two FPGA dma related minor fixes, from Ilya Lesokhin.
4) Use the right table to report offloaded TC rules, from Or Gerlitz.


Ilan Tayari (3):
  net/mlx5: FPGA, Add doxygen for access type enum
  net/mlx5: FPGA, Add device name
  net/mlx5: FPGA, print SBU identification on init

Ilya Lesokhin (2):
  net/mlx5: FPGA, Properly initialize dma direction on fpga conn send
  net/mlx5: FPGA, Call DMA unmap with the right size

Or Gerlitz (1):
  net/mlx5e: Get the number of offloaded TC rules from the correct table

Yevgeny Kliteynik (1):
  net/mlx5: FPGA, Abort FPGA init if the device reports no QP capability

 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c|  7 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h|  5 +---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h |  7 +
 .../net/ethernet/mellanox/mlx5/core/fpga/conn.c|  4 +--
 .../net/ethernet/mellanox/mlx5/core/fpga/core.c| 35 +++---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h |  6 
 6 files changed, 54 insertions(+), 10 deletions(-)


[PATCH] ixgbe: check ipsec ip addr against mgmt filter

2018-05-29 Thread Shannon Nelson
Make sure we don't try to offload the decryption of an incoming
packet that should get delivered to the management engine.  This
is a corner case that will likely be very seldom seen, but could
really confuse someone if they were to hit it.

Suggested-by: Jesse Brandeburg 
Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 64 ++
 1 file changed, 64 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 99b170f..ea3b5fa 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -445,6 +445,65 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state 
*xs,
 }
 
 /**
+ * ixgbe_ipsec_check_mgmt_ip - make sure there is no clash with mgmt IP filters
+ * @xs: pointer to transformer state struct
+ **/
+static int ixgbe_ipsec_check_mgmt_ip(struct xfrm_state *xs)
+{
+   struct net_device *dev = xs->xso.dev;
+   struct ixgbe_adapter *adapter = netdev_priv(dev);
+   struct ixgbe_hw *hw = >hw;
+   u32 mfval, manc, reg;
+   int num_filters = 4;
+   bool manc_ipv4;
+   int i, j;
+
+#define MANC_EN_IPV4_FILTER  BIT(24)
+#define MFVAL_IPV4_FILTER_SHIFT  16
+#define MFVAL_IPV6_FILTER_SHIFT  24
+#define MIPAF_ARR(_m, _n)(IXGBE_MIPAF + ((_m) * 0x10) + ((_n) * 4))
+
+   manc = IXGBE_READ_REG(hw, IXGBE_MANC);
+   manc_ipv4 = !!(manc & MANC_EN_IPV4_FILTER);
+   mfval = IXGBE_READ_REG(hw, IXGBE_MFVAL);
+
+   if (xs->props.family == AF_INET) {
+   /* are there any IPv4 filters to check? */
+   if (!manc_ipv4)
+   return 0;
+
+   /* the 4 ipv4 filters are all in MIPAF(3, i) */
+   for (i = 0; i < num_filters; i++) {
+   if (!(mfval & BIT(MFVAL_IPV4_FILTER_SHIFT + i)))
+   continue;
+
+   reg = IXGBE_READ_REG(hw, MIPAF_ARR(3, i));
+   if (reg == xs->id.daddr.a4)
+   return 1;
+   }
+   } else {
+   /* if there are ipv4 filters, they are in the last ipv6 slot */
+   if (manc_ipv4)
+   num_filters = 3;
+
+   for (i = 0; i < num_filters; i++) {
+   if (!(mfval & BIT(MFVAL_IPV6_FILTER_SHIFT + i)))
+   continue;
+
+   for (j = 0; j < 4; j++) {
+   reg = IXGBE_READ_REG(hw, MIPAF_ARR(i, j));
+   if (reg != xs->id.daddr.a6[j])
+   break;
+   }
+   if (j == 4)   /* did we match all 4 words? */
+   return 1;
+   }
+   }
+
+   return 0;
+}
+
+/**
  * ixgbe_ipsec_add_sa - program device with a security association
  * @xs: pointer to transformer state struct
  **/
@@ -465,6 +524,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
return -EINVAL;
}
 
+   if (ixgbe_ipsec_check_mgmt_ip(xs)) {
+   netdev_err(dev, "IPsec IP addr clash with mgmt filters\n");
+   return -EINVAL;
+   }
+
if (xs->xso.flags & XFRM_OFFLOAD_INBOUND) {
struct rx_sa rsa;
 
-- 
2.7.4



[PATCH mlx5-next 0/2] Mellanox, mlx5 new device events

2018-05-29 Thread Saeed Mahameed
Hi, 

The following series is for mlx5-next tree [1], it adds the support of two
new device events, from Ilan Tayari:

1. High temperature warnings.
2. FPGA QP error event.

In case of no objection this series will be applied to mlx5-next tree
and will be sent later as a pull request to both rdma and net trees.

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git/log/?h=mlx5-next

Thanks,
Saeed.

Ilan Tayari (2):
  net/mlx5: Add temperature warning event to log
  net/mlx5: Add FPGA QP error event

 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 28 +++-
 include/linux/mlx5/device.h  |  8 ++
 include/linux/mlx5/mlx5_ifc.h|  3 ++-
 include/linux/mlx5/mlx5_ifc_fpga.h   | 16 +++
 4 files changed, 53 insertions(+), 2 deletions(-)

-- 
2.17.0



[PATCH mlx5-next 2/2] net/mlx5: Add FPGA QP error event

2018-05-29 Thread Saeed Mahameed
From: Ilan Tayari 

The FPGA QP event fires whenever a QP on the FPGA transitions
to the error state.

At this stage, this event is unrecoverable, it may become recoverable
in the future.

Signed-off-by: Ilan Tayari 
Signed-off-by: Adi Nissim 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c |  7 +--
 include/linux/mlx5/device.h  |  1 +
 include/linux/mlx5/mlx5_ifc.h|  1 +
 include/linux/mlx5/mlx5_ifc_fpga.h   | 16 
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 4bd4f011f0a9..77c685645c66 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -161,6 +161,8 @@ static const char *eqe_type_str(u8 type)
return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE";
case MLX5_EVENT_TYPE_FPGA_ERROR:
return "MLX5_EVENT_TYPE_FPGA_ERROR";
+   case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
+   return "MLX5_EVENT_TYPE_FPGA_QP_ERROR";
case MLX5_EVENT_TYPE_GENERAL_EVENT:
return "MLX5_EVENT_TYPE_GENERAL_EVENT";
default:
@@ -560,6 +562,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
break;
 
case MLX5_EVENT_TYPE_FPGA_ERROR:
+   case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
mlx5_fpga_event(dev, eqe->type, >data.raw);
break;
 
@@ -839,11 +842,11 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
async_event_mask |= (1ull << MLX5_EVENT_TYPE_PPS_EVENT);
 
if (MLX5_CAP_GEN(dev, fpga))
-   async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR);
+   async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR) |
+   (1ull << MLX5_EVENT_TYPE_FPGA_QP_ERROR);
if (MLX5_CAP_GEN_MAX(dev, dct))
async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED);
 
-
if (MLX5_CAP_GEN(dev, temp_warn_event))
async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT);
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index eddacee5cf61..71e1dc2523a6 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -331,6 +331,7 @@ enum mlx5_event {
MLX5_EVENT_TYPE_DCT_DRAINED= 0x1c,
 
MLX5_EVENT_TYPE_FPGA_ERROR = 0x20,
+   MLX5_EVENT_TYPE_FPGA_QP_ERROR  = 0x21,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ba30c26aa6eb..3e8845dc85fe 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -60,6 +60,7 @@ enum {
MLX5_EVENT_TYPE_CODING_COMMAND_INTERFACE_COMPLETION= 0xa,
MLX5_EVENT_TYPE_CODING_PAGE_REQUEST= 0xb,
MLX5_EVENT_TYPE_CODING_FPGA_ERROR  = 0x20,
+   MLX5_EVENT_TYPE_CODING_FPGA_QP_ERROR   = 0x21
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h 
b/include/linux/mlx5/mlx5_ifc_fpga.h
index ec052491ba3d..7ddca31fa05d 100644
--- a/include/linux/mlx5/mlx5_ifc_fpga.h
+++ b/include/linux/mlx5/mlx5_ifc_fpga.h
@@ -432,6 +432,22 @@ struct mlx5_ifc_ipsec_counters_bits {
u8 dropped_cmd[0x40];
 };
 
+enum {
+   MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RETRY_COUNTER_EXPIRED  = 0x1,
+   MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RNR_EXPIRED= 0x2,
+};
+
+struct mlx5_ifc_fpga_qp_error_event_bits {
+   u8 reserved_at_0[0x40];
+
+   u8 reserved_at_40[0x18];
+   u8 syndrome[0x8];
+
+   u8 reserved_at_60[0x60];
+
+   u8 reserved_at_c0[0x8];
+   u8 fpga_qpn[0x18];
+};
 enum mlx5_ifc_fpga_ipsec_response_syndrome {
MLX5_FPGA_IPSEC_RESPONSE_SUCCESS = 0,
MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST = 1,
-- 
2.17.0



[PATCH mlx5-next 1/2] net/mlx5: Add temperature warning event to log

2018-05-29 Thread Saeed Mahameed
From: Ilan Tayari 

Temperature warning event is sent by FW to indicate high temperature
as detected by one of the sensors on the board.
Add handling of this event by writing the numbers of the alert sensors
to the kernel log.

Signed-off-by: Ilan Tayari 
Signed-off-by: Adi Nissim 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 23 
 include/linux/mlx5/device.h  |  7 ++
 include/linux/mlx5/mlx5_ifc.h|  2 +-
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index c1c94974e16b..4bd4f011f0a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -141,6 +141,8 @@ static const char *eqe_type_str(u8 type)
return "MLX5_EVENT_TYPE_GPIO_EVENT";
case MLX5_EVENT_TYPE_PORT_MODULE_EVENT:
return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT";
+   case MLX5_EVENT_TYPE_TEMP_WARN_EVENT:
+   return "MLX5_EVENT_TYPE_TEMP_WARN_EVENT";
case MLX5_EVENT_TYPE_REMOTE_CONFIG:
return "MLX5_EVENT_TYPE_REMOTE_CONFIG";
case MLX5_EVENT_TYPE_DB_BF_CONGESTION:
@@ -393,6 +395,20 @@ static void general_event_handler(struct mlx5_core_dev 
*dev,
}
 }
 
+static void mlx5_temp_warning_event(struct mlx5_core_dev *dev,
+   struct mlx5_eqe *eqe)
+{
+   u64 value_lsb;
+   u64 value_msb;
+
+   value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb);
+   value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb);
+
+   mlx5_core_warn(dev,
+  "High temperature on sensors with bit set %llx %llx",
+  value_msb, value_lsb);
+}
+
 /* caller must eventually call mlx5_cq_put on the returned cq */
 static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn)
 {
@@ -547,6 +563,10 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
mlx5_fpga_event(dev, eqe->type, >data.raw);
break;
 
+   case MLX5_EVENT_TYPE_TEMP_WARN_EVENT:
+   mlx5_temp_warning_event(dev, eqe);
+   break;
+
case MLX5_EVENT_TYPE_GENERAL_EVENT:
general_event_handler(dev, eqe);
break;
@@ -824,6 +844,9 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED);
 
 
+   if (MLX5_CAP_GEN(dev, temp_warn_event))
+   async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT);
+
err = mlx5_create_map_eq(dev, >cmd_eq, MLX5_EQ_VEC_CMD,
 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
 "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 2bc27f8c5b87..eddacee5cf61 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -314,6 +314,7 @@ enum mlx5_event {
MLX5_EVENT_TYPE_PORT_CHANGE= 0x09,
MLX5_EVENT_TYPE_GPIO_EVENT = 0x15,
MLX5_EVENT_TYPE_PORT_MODULE_EVENT  = 0x16,
+   MLX5_EVENT_TYPE_TEMP_WARN_EVENT= 0x17,
MLX5_EVENT_TYPE_REMOTE_CONFIG  = 0x19,
MLX5_EVENT_TYPE_GENERAL_EVENT  = 0x22,
MLX5_EVENT_TYPE_PPS_EVENT  = 0x25,
@@ -626,6 +627,11 @@ struct mlx5_eqe_dct {
__be32  dctn;
 };
 
+struct mlx5_eqe_temp_warning {
+   __be64 sensor_warning_msb;
+   __be64 sensor_warning_lsb;
+} __packed;
+
 union ev_data {
__be32  raw[7];
struct mlx5_eqe_cmd cmd;
@@ -642,6 +648,7 @@ union ev_data {
struct mlx5_eqe_port_module port_module;
struct mlx5_eqe_pps pps;
struct mlx5_eqe_dct dct;
+   struct mlx5_eqe_temp_warningtemp_warning;
 } __packed;
 
 struct mlx5_eqe {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 10c1613d9434..ba30c26aa6eb 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -926,7 +926,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 log_max_msg[0x5];
u8 reserved_at_1c8[0x4];
u8 max_tc[0x4];
-   u8 reserved_at_1d0[0x1];
+   u8 temp_warn_event[0x1];
u8 dcbx[0x1];
u8 general_notification_event[0x1];
u8 reserved_at_1d3[0x2];
-- 
2.17.0



Re: [PATCH bpf-next 11/11] bpf, doc: add missing patchwork url and libbpf to maintainers

2018-05-29 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Add missing bits under tools/lib/bpf/ and also Q: entry in order to
> make it easier for people to retrieve current patch queue.
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 


> ---
>  MAINTAINERS | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f492431..2fd51db 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2722,6 +2722,7 @@ L:netdev@vger.kernel.org
>  L: linux-ker...@vger.kernel.org
>  T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
>  T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
> +Q: https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
>  S: Supported
>  F: arch/x86/net/bpf_jit*
>  F: Documentation/networking/filter.txt
> @@ -2740,6 +2741,7 @@ F:net/sched/act_bpf.c
>  F: net/sched/cls_bpf.c
>  F: samples/bpf/
>  F: tools/bpf/
> +F: tools/lib/bpf/
>  F: tools/testing/selftests/bpf/
>
>  BROADCOM B44 10/100 ETHERNET DRIVER
> --
> 2.9.5
>


Re: [PATCH bpf-next] bpftool: Support sendmsg{4,6} attach types

2018-05-29 Thread Song Liu
On Tue, May 29, 2018 at 2:20 PM, Jakub Kicinski  wrote:
> On Tue, 29 May 2018 13:29:31 -0700, Andrey Ignatov wrote:
>> Add support for recently added BPF_CGROUP_UDP4_SENDMSG and
>> BPF_CGROUP_UDP6_SENDMSG attach types to bpftool, update documentation
>> and bash completion.
>>
>> Signed-off-by: Andrey Ignatov 
>
> Reviewed-by: Jakub Kicinski 
>
>> I'm not sure about "since 4.18" in Documentation part. I can follow-up when
>> the next kernel version is known.
>
> IMHO it's fine, we can follow up if Linus decides to call it something
> else :)
>
> Thanks!

Acked-by: Song Liu 


Re: [PATCH net-next 0/8] nfp: offload LAG for tc flower egress

2018-05-29 Thread Jiri Pirko
Tue, May 29, 2018 at 04:08:48PM CEST, john.hur...@netronome.com wrote:
>On Sat, May 26, 2018 at 3:47 AM, Jakub Kicinski
> wrote:
>> On Fri, 25 May 2018 08:48:09 +0200, Jiri Pirko wrote:
>>> Thu, May 24, 2018 at 04:22:47AM CEST, jakub.kicin...@netronome.com wrote:
>>> >Hi!
>>> >
>>> >This series from John adds bond offload to the nfp driver.  Patch 5
>>> >exposes the hash type for NETDEV_LAG_TX_TYPE_HASH to make sure nfp
>>> >hashing matches that of the software LAG.  This may be unnecessarily
>>> >conservative, let's see what LAG maintainers think :)
>>>
>>> So you need to restrict offload to only certain hash algo? In mlxsw, we
>>> just ignore the lag setting and do some hw default hashing. Would not be
>>> enough? Note that there's a good reason for it, as you see, in team, the
>>> hashing is done in a BPF function and could be totally arbitrary.
>>> Your patchset effectively disables team offload for nfp.
>>
>> My understanding is that the project requirements only called for L3/L4
>> hash algorithm offload, hence the temptation to err on the side of
>> caution and not offload all the bond configurations.  John can provide
>> more details.  Not being able to offload team is unfortunate indeed.
>
>Hi Jiri,
>Yes, as Jakub mentions, we restrict ourselves to L3/L4 hash algorithm
>as this is currently what is supported in fw.

In mlxsw, a default l3/l4 is used always, no matter what the
bonding/team sets. It is not correct, but it works with team as well.
Perhaps we can have NETDEV_LAG_HASH_UNKNOWN to indicate to the driver to
do some default? That would make the "team" offload functional.

>Hopefully this will change as fw features are expanded.
>I understand the issue this presents with offloading team.
>Perhaps resorting to a default hw hash for team is acceptable.
>John


Re: [PATCH mlx5-next 1/3] net/mlx5: Exposing a new mini-CQE format

2018-05-29 Thread Jason Gunthorpe
On Tue, May 29, 2018 at 03:01:27PM -0600, Saeed Mahameed wrote:
> On Sun, 2018-05-27 at 13:42 +0300, Leon Romanovsky wrote:
> > From: Yonatan Cohen 
> > 
> > The new mini-CQE format includes byte-count, checksum
> > and stride index.
> > 
> > Reviewed-by: Yishai Hadas 
> > Reviewed-by: Guy Levi 
> > Signed-off-by: Yonatan Cohen 
> > Signed-off-by: Leon Romanovsky 
> 
> 
> Applied to mlx5-next.
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git  mlx5-
> next
> 
> commit-id: ab741b2eed3e456cebd2240d4c9c6be003d5ae72

Thanks, everything is now merged as:

https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?h=wip/jgg-for-next=f3ca0ab114e0de3bbad4c4a537d32fb57aa42f81

Jason


Re: [PATCH bpf-next] bpftool: Support sendmsg{4,6} attach types

2018-05-29 Thread Jakub Kicinski
On Tue, 29 May 2018 13:29:31 -0700, Andrey Ignatov wrote:
> Add support for recently added BPF_CGROUP_UDP4_SENDMSG and
> BPF_CGROUP_UDP6_SENDMSG attach types to bpftool, update documentation
> and bash completion.
> 
> Signed-off-by: Andrey Ignatov 

Reviewed-by: Jakub Kicinski 

> I'm not sure about "since 4.18" in Documentation part. I can follow-up when
> the next kernel version is known.

IMHO it's fine, we can follow up if Linus decides to call it something
else :)

Thanks!


Re: [PATCH net-next 0/7] net/ipv6: Fix route append and replace use cases

2018-05-29 Thread Thomas Winter
The only thing this breaks is adding IPv6 routes via ioctl. Previously they 
would be automatically appended to form multipath routes but this no longer 
occurs. Switching to netlink, or using iproute2 with NLM_F_APPEND, gets around 
this. This is basically what David Ahern said earlier, but I wanted to make it 
clear that the default ioctl behaviour has changed.

From: David Ahern 
Sent: 23 May 2018 08:44
To: David Miller; dsah...@kernel.org
Cc: netdev@vger.kernel.org; Thomas Winter; ido...@mellanox.com; 
sha...@cumulusnetworks.com; ro...@cumulusnetworks.com
Subject: Re: [PATCH net-next 0/7] net/ipv6: Fix route append and replace use 
cases

On 5/22/18 12:46 PM, David Miller wrote:
>
> Ok, I'll apply this series.
>
> But if this breaks things for anyone in a practical way, I am unfortunately
> going to have to revert no matter how silly the current behavior may be.
>

Understood. I have to try the best option first. I'll look at
regressions if they happen.


Re: [PATCH mlx5-next 1/3] net/mlx5: Exposing a new mini-CQE format

2018-05-29 Thread Saeed Mahameed
On Sun, 2018-05-27 at 13:42 +0300, Leon Romanovsky wrote:
> From: Yonatan Cohen 
> 
> The new mini-CQE format includes byte-count, checksum
> and stride index.
> 
> Reviewed-by: Yishai Hadas 
> Reviewed-by: Guy Levi 
> Signed-off-by: Yonatan Cohen 
> Signed-off-by: Leon Romanovsky 


Applied to mlx5-next.

git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git  mlx5-
next

commit-id: ab741b2eed3e456cebd2240d4c9c6be003d5ae72

Thanks!



Re: [PATCH rdma-next v2 01/13] IB/uverbs: Add an ib_uobject getter to ioctl() infrastructure

2018-05-29 Thread Jason Gunthorpe
On Tue, May 29, 2018 at 08:49:58PM +, Ruhl, Michael J wrote:
> >From: Jason Gunthorpe [mailto:j...@mellanox.com]
> >Sent: Tuesday, May 29, 2018 4:21 PM
> >To: Ruhl, Michael J 
> >Cc: Leon Romanovsky ; Doug Ledford
> >; Leon Romanovsky ; RDMA
> >mailing list ; Boris Pismenny
> >; Matan Barak ; Raed
> >Salem ; Yishai Hadas ; Saeed
> >Mahameed ; linux-netdev
> >
> >Subject: Re: [PATCH rdma-next v2 01/13] IB/uverbs: Add an ib_uobject getter
> >to ioctl() infrastructure
> >
> >On Tue, May 29, 2018 at 07:31:22PM +, Ruhl, Michael J wrote:
> >> >- struct ib_uverbs_destroy_cq_resp resp;
> >> >  struct ib_uobject *uobj =
> >> >- uverbs_attr_get(attrs,
> >> >UVERBS_ATTR_DESTROY_CQ_HANDLE)->obj_attr.uobject;
> >> >- struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object,
> >> >-  uobject);
> >> >+ uverbs_attr_get_uobject(attrs,
> >> >UVERBS_ATTR_DESTROY_CQ_HANDLE);
> >> >+ struct ib_uverbs_destroy_cq_resp resp;
> >> >+ struct ib_ucq_object *obj;
> >> >  int ret;
> >> >
> >> >+ if (IS_ERR(uobj))
> >> >+ return PTR_ERR(uobj);
> >> >+
> >>
> >> I remember a conversation that if an method attribute was mandatory, that
> >you did not need to
> >> test the uobj for error (since it was checked in the infrastructure).
> >
> >Yes.
> >
> >> Is this error check necessary?
> >
> >No
> >
> >But there is no way to check one way or the other at compile time
> >right now, and omitting the check makes smatch mad.
> 
> Is smatch going to get mad at (same patch):

Yes, this is where it already got mad, IIRC :( 

Fixing this whole thing is a todo on my list..

Jason


RE: [PATCH rdma-next v2 01/13] IB/uverbs: Add an ib_uobject getter to ioctl() infrastructure

2018-05-29 Thread Ruhl, Michael J
>-Original Message-
>From: Jason Gunthorpe [mailto:j...@mellanox.com]
>Sent: Tuesday, May 29, 2018 4:21 PM
>To: Ruhl, Michael J 
>Cc: Leon Romanovsky ; Doug Ledford
>; Leon Romanovsky ; RDMA
>mailing list ; Boris Pismenny
>; Matan Barak ; Raed
>Salem ; Yishai Hadas ; Saeed
>Mahameed ; linux-netdev
>
>Subject: Re: [PATCH rdma-next v2 01/13] IB/uverbs: Add an ib_uobject getter
>to ioctl() infrastructure
>
>On Tue, May 29, 2018 at 07:31:22PM +, Ruhl, Michael J wrote:
>> >-   struct ib_uverbs_destroy_cq_resp resp;
>> >struct ib_uobject *uobj =
>> >-   uverbs_attr_get(attrs,
>> >UVERBS_ATTR_DESTROY_CQ_HANDLE)->obj_attr.uobject;
>> >-   struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object,
>> >-uobject);
>> >+   uverbs_attr_get_uobject(attrs,
>> >UVERBS_ATTR_DESTROY_CQ_HANDLE);
>> >+   struct ib_uverbs_destroy_cq_resp resp;
>> >+   struct ib_ucq_object *obj;
>> >int ret;
>> >
>> >+   if (IS_ERR(uobj))
>> >+   return PTR_ERR(uobj);
>> >+
>>
>> I remember a conversation that if an method attribute was mandatory, that
>you did not need to
>> test the uobj for error (since it was checked in the infrastructure).
>
>Yes.
>
>> Is this error check necessary?
>
>No
>
>But there is no way to check one way or the other at compile time
>right now, and omitting the check makes smatch mad.

Is smatch going to get mad at (same patch):

diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c 
b/drivers/infiniband/core/uverbs_std_types_flow_action.c
index b4f016dfa23d..a7be51cf2e42 100644
--- a/drivers/infiniband/core/uverbs_std_types_flow_action.c
+++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c
@@ -320,7 +320,7 @@ static int 
UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct ib_device
return ret;

/* No need to check as this attribute is marked as MANDATORY */
-   uobj = uverbs_attr_get(attrs, 
UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject;
+   uobj = uverbs_attr_get_uobject(attrs, 
UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE);
action = ib_dev->create_flow_action_esp(ib_dev, _attr.hdr, attrs);
if (IS_ERR(action))
return PTR_ERR(action);
@@ -350,7 +350,7 @@ static int 
UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(struct ib_device
if (ret)
return ret;

-   uobj = uverbs_attr_get(attrs, 
UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject;
+   uobj = uverbs_attr_get_uobject(attrs, 
UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE);
action = uobj->object;

?

If not,

Reviewed-by: Michael J. Ruhl 

Thanks,

Mike

>We need some more patches to be able to safely omit the check...
>
>Jason


Re: [PATCH net-next 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.

2018-05-29 Thread Samudrala, Sridhar

On 5/29/2018 1:18 AM, Michael Chan wrote:

VF Queue resources are always limited and there is currently no
infrastructure to allow the admin. on the host to add or reduce queue
resources for any particular VF.  With ever increasing number of VFs
being supported, it is desirable to allow the admin. to configure queue
resources differently for the VFs.  Some VFs may require more or fewer
queues due to different bandwidth requirements or different number of
vCPUs in the VM.  This patch adds the infrastructure to do that by
adding IFLA_VF_QUEUES netlink attribute and a new .ndo_set_vf_queues()
to the net_device_ops.

Four parameters are exposed for each VF:

o min_tx_queues - Guaranteed tx queues available to the VF.

o max_tx_queues - Maximum but not necessarily guaranteed tx queues
   available to the VF.

o min_rx_queues - Guaranteed rx queues available to the VF.

o max_rx_queues - Maximum but not necessarily guaranteed rx queues
   available to the VF.

The "ip link set" command will subsequently be patched to support the new
operation to set the above parameters.

After the admin. makes a change to the above parameters, the corresponding
VF will have a new range of channels to set using ethtool -L.  The VF may
have to go through IF down/up before the new queues will take effect.  Up
to the min values are guaranteed.  Up to the max values are possible but not
guaranteed.

Signed-off-by: Michael Chan 
---
  include/linux/if_link.h  |  4 
  include/linux/netdevice.h|  6 ++
  include/uapi/linux/if_link.h |  9 +
  net/core/rtnetlink.c | 32 +---
  4 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 622658d..8e81121 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -29,5 +29,9 @@ struct ifla_vf_info {
__u32 rss_query_en;
__u32 trusted;
__be16 vlan_proto;
+   __u32 min_tx_queues;
+   __u32 max_tx_queues;
+   __u32 min_rx_queues;
+   __u32 max_rx_queues;
  };
  #endif /* _LINUX_IF_LINK_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8452f72..17f5892 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1023,6 +1023,8 @@ struct dev_ifalias {
   *  with PF and querying it may introduce a theoretical security risk.
   * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool 
setting);
   * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff 
*skb);
+ * int (*ndo_set_vf_queues)(struct net_device *dev, int vf, int min_txq,
+ * int max_txq, int min_rxq, int max_rxq);


Isn't ndo_set_vf_xxx() considered a legacy interface and not planned to be 
extended?
Shouldn't we enable this via ethtool on the port representor netdev?




[PATCH bpf-next] bpftool: Support sendmsg{4,6} attach types

2018-05-29 Thread Andrey Ignatov
Add support for recently added BPF_CGROUP_UDP4_SENDMSG and
BPF_CGROUP_UDP6_SENDMSG attach types to bpftool, update documentation
and bash completion.

Signed-off-by: Andrey Ignatov 
---
I'm not sure about "since 4.18" in Documentation part. I can follow-up when
the next kernel version is known.
---
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 9 +++--
 tools/bpf/bpftool/bash-completion/bpftool  | 5 +++--
 tools/bpf/bpftool/cgroup.c | 4 +++-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst 
b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index d004f63..7b0e6d4 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -27,7 +27,8 @@ MAP COMMANDS
 |
 |  *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
 |  *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | 
**sock_ops** | **device** |
-|  **bind4** | **bind6** | **post_bind4** | **post_bind6** | 
**connect4** | **connect6** }
+|  **bind4** | **bind6** | **post_bind4** | **post_bind6** | 
**connect4** | **connect6** |
+|   **sendmsg4** | **sendmsg6** }
 |  *ATTACH_FLAGS* := { **multi** | **override** }
 
 DESCRIPTION
@@ -70,7 +71,11 @@ DESCRIPTION
  **post_bind4** return from bind(2) for an inet4 socket (since 
4.17);
  **post_bind6** return from bind(2) for an inet6 socket (since 
4.17);
  **connect4** call to connect(2) for an inet4 socket (since 
4.17);
- **connect6** call to connect(2) for an inet6 socket (since 
4.17).
+ **connect6** call to connect(2) for an inet6 socket (since 
4.17);
+ **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an
+ unconnected udp4 socket (since 4.18);
+ **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an
+ unconnected udp6 socket (since 4.18).
 
**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
  Detach *PROG* from the cgroup *CGROUP* and attach type
diff --git a/tools/bpf/bpftool/bash-completion/bpftool 
b/tools/bpf/bpftool/bash-completion/bpftool
index 7bc198d..1e10833 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -407,7 +407,7 @@ _bpftool()
 attach|detach)
 local ATTACH_TYPES='ingress egress sock_create sock_ops \
 device bind4 bind6 post_bind4 post_bind6 connect4 \
-connect6'
+connect6 sendmsg4 sendmsg6'
 local ATTACH_FLAGS='multi override'
 local PROG_TYPE='id pinned tag'
 case $prev in
@@ -416,7 +416,8 @@ _bpftool()
 return 0
 ;;
 
ingress|egress|sock_create|sock_ops|device|bind4|bind6|\
-post_bind4|post_bind6|connect4|connect6)
+post_bind4|post_bind6|connect4|connect6|sendmsg4|\
+sendmsg6)
 COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
 "$cur" ) )
 return 0
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index 1351bd6..16bee01 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -20,7 +20,7 @@
"   ATTACH_TYPE := { ingress | egress | sock_create |\n"   \
"sock_ops | device | bind4 | bind6 |\n"\
"post_bind4 | post_bind6 | connect4 |\n"   \
-   "connect6 }"
+   "connect6 | sendmsg4 | sendmsg6 }"
 
 static const char * const attach_type_strings[] = {
[BPF_CGROUP_INET_INGRESS] = "ingress",
@@ -34,6 +34,8 @@ static const char * const attach_type_strings[] = {
[BPF_CGROUP_INET6_CONNECT] = "connect6",
[BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
[BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
+   [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
+   [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
[__MAX_BPF_ATTACH_TYPE] = NULL,
 };
 
-- 
2.9.5



Re: [PATCH rdma-next v2 01/13] IB/uverbs: Add an ib_uobject getter to ioctl() infrastructure

2018-05-29 Thread Jason Gunthorpe
On Tue, May 29, 2018 at 07:31:22PM +, Ruhl, Michael J wrote:
> >-struct ib_uverbs_destroy_cq_resp resp;
> > struct ib_uobject *uobj =
> >-uverbs_attr_get(attrs,
> >UVERBS_ATTR_DESTROY_CQ_HANDLE)->obj_attr.uobject;
> >-struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object,
> >- uobject);
> >+uverbs_attr_get_uobject(attrs,
> >UVERBS_ATTR_DESTROY_CQ_HANDLE);
> >+struct ib_uverbs_destroy_cq_resp resp;
> >+struct ib_ucq_object *obj;
> > int ret;
> >
> >+if (IS_ERR(uobj))
> >+return PTR_ERR(uobj);
> >+
> 
> I remember a conversation that if a method attribute was mandatory, you
> did not need to
> test the uobj for error (since it was checked in the infrastructure).

Yes.

> Is this error check necessary?

No

But there is no way to check one way or the other at compile time
right now, and omitting the check makes smatch mad.

We need some more patches to be able to safely omit the check...

Jason


Re: [PATCH rdma-next 0/3] Introduce new mlx5 CQE format

2018-05-29 Thread Jason Gunthorpe
On Sun, May 27, 2018 at 01:42:31PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky 
> 
> Introduce new internal to mlx5 CQE format - mini-CQE. It is a CQE in
> compressed form that holds data needed to extra a single full CQE.
> 
It holds the stride index, byte count and packet checksum.
> 
> Thanks
> 
> Yonatan Cohen (3):
>   net/mlx5: Exposing a new mini-CQE format
>   IB/mlx5: Refactor CQE compression response
>   IB/mlx5: Introduce a new mini-CQE format

Applied to for-next.

Generally taking new uapi patches that are first the list should have
a few weeks of comment period, but since this is just adding a new bit
to an existing driver private api it seems OK to go this merge window.

Thanks,
Jason


Re: [PATCH] net: sched: split tc_ctl_tfilter into three handlers

2018-05-29 Thread David Miller
From: Vlad Buslov 
Date: Sun, 27 May 2018 22:55:03 +0300

> tc_ctl_tfilter handles three netlink message types: RTM_NEWTFILTER,
> RTM_DELTFILTER, RTM_GETTFILTER. However, implementation of this function
> involves a lot of branching on specific message type because most of the
> code is message-specific. This significantly complicates adding new
> functionality and doesn't provide much benefit of code reuse.
> 
> Split tc_ctl_tfilter to three standalone functions that handle filter new,
> delete and get requests.
> 
> The only truly protocol independent part of tc_ctl_tfilter is code that
> looks up queue, class, and block. Refactor this code to standalone
> tcf_block_find function that is used by all three new handlers.
> 
> Signed-off-by: Vlad Buslov 

This looks fine but doesn't apply cleanly to net-next.


Re: [PATCH mlx5-next v2 11/13] IB/mlx5: Add flow counters binding support

2018-05-29 Thread Jason Gunthorpe
On Tue, May 29, 2018 at 04:09:15PM +0300, Leon Romanovsky wrote:
> diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
> index 508ea8c82da7..ef3f430a7050 100644
> +++ b/include/uapi/rdma/mlx5-abi.h
> @@ -443,4 +443,18 @@ enum {
>  enum {
>   MLX5_IB_CLOCK_INFO_V1  = 0,
>  };
> +
> +struct mlx5_ib_flow_counters_data {
> + __aligned_u64   counters_data;
> + __u32   ncounters;
> + __u32   reserved;
> +};
> +
> +struct mlx5_ib_create_flow {
> + __u32   ncounters_data;
> + __u32   reserved;
> + /* Following are counters data based on ncounters_data */
> + struct mlx5_ib_flow_counters_data data[];
> +};
> +
>  #endif /* MLX5_ABI_USER_H */

This uapi thing still needs to be fixed as I pointed out before.

I still can't figure out why this should be a 2d array. I think it
should be written simply as:

struct mlx5_ib_flow_counter_desc {
__u32 description;
__u32 index;
};

struct mlx5_ib_create_flow {
RDMA_UAPI_PTR(struct mlx5_ib_flow_counter_desc, counters_data);
__u32   ncounters;
__u32   reserved;
};

With the corresponding changes elsewhere.

A flex array at the end of a struct means that the struct can never be
extended again which seems like a terrible idea, especially since I
can't fathom why we'd need more that one array of counters and the
current code doesn't even support more than one..

Jason


Re: [PATCH bpf-next 04/11] bpf: show prog and map id in fdinfo

2018-05-29 Thread Daniel Borkmann
On 05/29/2018 07:27 PM, Jesper Dangaard Brouer wrote:
> On Mon, 28 May 2018 02:43:37 +0200
> Daniel Borkmann  wrote:
> 
>> Its trivial and straight forward to expose it for scripts that can
>> then use it along with bpftool in order to inspect an individual
>> application's used maps and progs. Right now we dump some basic
>> information in the fdinfo file but with the help of the map/prog
>> id full introspection becomes possible now.
>>
>> Signed-off-by: Daniel Borkmann 
>> Acked-by: Alexei Starovoitov 
> 
> AFAICR iproute uses this proc fdinfo, for pinned maps.  Have you tested
> if this change is handled gracefully by tc ?

Yep, it works just fine, I also tested it before submission.


Re: [PATCH bpf-next] bpf: Drop mpls from bpf_fib_lookup

2018-05-29 Thread Daniel Borkmann
On 05/29/2018 07:58 PM, dsah...@kernel.org wrote:
> From: David Ahern 
> 
> MPLS support will not be submitted this dev cycle, but in working on it
> I do see a few changes are needed to the API. For now, drop mpls from the
> API. Since the fields in question are unions, the mpls fields can be added
> back later without affecting the uapi.
> 
> Signed-off-by: David Ahern 

Applied to bpf-next, thanks David!


Re: [PATCH bpf-next] bpf: Verify flags in bpf_fib_lookup

2018-05-29 Thread Daniel Borkmann
On 05/29/2018 08:59 PM, dsah...@kernel.org wrote:
> From: David Ahern 
> 
> Verify flags argument contains only known flags. Allows programs to probe
> for support as more are added.
> 
> Signed-off-by: David Ahern 

Applied to bpf-next, thanks David!


Re: [PATCH bpf-next] bpf: clean up eBPF helpers documentation

2018-05-29 Thread Daniel Borkmann
On 05/29/2018 08:27 PM, Song Liu wrote:
> On Tue, May 29, 2018 at 4:27 AM, Quentin Monnet
>  wrote:
>> These are minor edits for the eBPF helpers documentation in
>> include/uapi/linux/bpf.h.
>>
>> The main fix consists in removing "BPF_FIB_LOOKUP_", because it ends
>> with a non-escaped underscore that gets interpreted by rst2man and
>> produces the following message in the resulting manual page:
>>
>> DOCUTILS SYSTEM MESSAGES
>>System Message: ERROR/3 (/tmp/bpf-helpers.rst:, line 1514)
>>   Unknown target name: "bpf_fib_lookup".
>>
>> Other edits consist in:
>>
>> - Improving formatting for flag values for "bpf_fib_lookup()" helper.
>> - Emphasising a parameter name in description of the return value for
>>   "bpf_get_stack()" helper.
>> - Removing unnecessary blank lines between "Description" and "Return"
>>   sections for the few helpers that would use it, for consistency.
>>
>> Signed-off-by: Quentin Monnet 
[...]
> 
> Please also apply the same changes to tools/include/uapi/linux/bpf.h.

Just did while applying to bpf-next, thanks guys!

> Other than this, it looks good to me.
> 
> Acked-by: Song Liu 
> 
> Thanks,
> Song
> 



RE: [PATCH rdma-next v2 01/13] IB/uverbs: Add an ib_uobject getter to ioctl() infrastructure

2018-05-29 Thread Ruhl, Michael J
>-Original Message-
>From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
>ow...@vger.kernel.org] On Behalf Of Leon Romanovsky
>Sent: Tuesday, May 29, 2018 9:09 AM
>To: Doug Ledford ; Jason Gunthorpe
>
>Cc: Leon Romanovsky ; RDMA mailing list r...@vger.kernel.org>; Boris Pismenny ; Matan
>Barak ; Raed Salem ; Yishai
>Hadas ; Saeed Mahameed
>; linux-netdev 
>Subject: [PATCH rdma-next v2 01/13] IB/uverbs: Add an ib_uobject getter to
>ioctl() infrastructure
>
>From: Matan Barak 
>
>Previously, the user had to dig inside the attribute to get the uobject.
>Add a helper function that correctly extract it (and do the required
>checks) for him/her.
>
>Tested-by: Michael Guralnik 
>Signed-off-by: Matan Barak 
>Signed-off-by: Leon Romanovsky 
>---
> drivers/infiniband/core/uverbs_std_types_cq.c  | 23 +++-
>--
> .../infiniband/core/uverbs_std_types_flow_action.c |  4 ++--
> include/rdma/uverbs_ioctl.h| 11 +++
> 3 files changed, 25 insertions(+), 13 deletions(-)
>
>diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c
>b/drivers/infiniband/core/uverbs_std_types_cq.c
>index b0dbae9dd0d7..3d293d01afea 100644
>--- a/drivers/infiniband/core/uverbs_std_types_cq.c
>+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
>@@ -65,7 +65,6 @@ static int
>UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev,
>   struct ib_cq_init_attr attr = {};
>   struct ib_cq   *cq;
>   struct ib_uverbs_completion_event_file*ev_file = NULL;
>-  const struct uverbs_attr *ev_file_attr;
>   struct ib_uobject *ev_file_uobj;
>
>   if (!(ib_dev->uverbs_cmd_mask & 1ULL <<
>IB_USER_VERBS_CMD_CREATE_CQ))
>@@ -87,10 +86,8 @@ static int
>UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev,
>
>   UVERBS_ATTR_CREATE_CQ_FLAGS)))
>   return -EFAULT;
>
>-  ev_file_attr = uverbs_attr_get(attrs,
>UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL);
>-  if (!IS_ERR(ev_file_attr)) {
>-  ev_file_uobj = ev_file_attr->obj_attr.uobject;
>-
>+  ev_file_uobj = uverbs_attr_get_uobject(attrs,
>UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL);
>+  if (!IS_ERR(ev_file_uobj)) {
>   ev_file = container_of(ev_file_uobj,
>  struct ib_uverbs_completion_event_file,
>  uobj_file.uobj);
>@@ -102,8 +99,8 @@ static int
>UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev,
>   goto err_event_file;
>   }
>
>-  obj = container_of(uverbs_attr_get(attrs,
>-
>UVERBS_ATTR_CREATE_CQ_HANDLE)->obj_attr.uobject,

See comment below on error checking.  Does this need the error check?

>+  obj = container_of(uverbs_attr_get_uobject(attrs,
>+
>UVERBS_ATTR_CREATE_CQ_HANDLE),
>  typeof(*obj), uobject);
>   obj->uverbs_file   = ucontext->ufile;
>   obj->comp_events_reported  = 0;
>@@ -170,13 +167,17 @@ static int
>UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(struct ib_device
>*ib_dev,
>   struct ib_uverbs_file *file,
>   struct uverbs_attr_bundle
>*attrs)
> {
>-  struct ib_uverbs_destroy_cq_resp resp;
>   struct ib_uobject *uobj =
>-  uverbs_attr_get(attrs,
>UVERBS_ATTR_DESTROY_CQ_HANDLE)->obj_attr.uobject;
>-  struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object,
>-   uobject);
>+  uverbs_attr_get_uobject(attrs,
>UVERBS_ATTR_DESTROY_CQ_HANDLE);
>+  struct ib_uverbs_destroy_cq_resp resp;
>+  struct ib_ucq_object *obj;
>   int ret;
>
>+  if (IS_ERR(uobj))
>+  return PTR_ERR(uobj);
>+

I remember a conversation that if a method attribute was mandatory, you
did not need to
test the uobj for error (since it was checked in the infrastructure).

Is this error check necessary?

Thanks

Mike

>+  obj = container_of(uobj, struct ib_ucq_object, uobject);
>+
>   if (!(ib_dev->uverbs_cmd_mask & 1ULL <<
>IB_USER_VERBS_CMD_DESTROY_CQ))
>   return -EOPNOTSUPP;
>
>diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c
>b/drivers/infiniband/core/uverbs_std_types_flow_action.c
>index b4f016dfa23d..a7be51cf2e42 100644
>--- a/drivers/infiniband/core/uverbs_std_types_flow_action.c
>+++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c
>@@ -320,7 +320,7 @@ static int
>UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct
>ib_device
>   return ret;
>
>   /* No need to check as this attribute is marked as MANDATORY */
>-  uobj = uverbs_attr_get(attrs,
>UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject;
>+  uobj = uverbs_attr_get_uobject(attrs,
>UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE);
>   action = ib_dev->create_flow_action_esp(ib_dev, _attr.hdr,
>attrs);
>   if (IS_ERR(action))
>   return PTR_ERR(action);
>@@ 

Re: [PATCH net-next 1/5] net: aquantia: Ethtool based ring size configuration

2018-05-29 Thread Jakub Kicinski
On Tue, 29 May 2018 15:56:58 +0300, Igor Russkikh wrote:
> +static int aq_set_ringparam(struct net_device *ndev,
> + struct ethtool_ringparam *ring)
> +{
> + int err = 0;
> + struct aq_nic_s *aq_nic = netdev_priv(ndev);
> + struct aq_nic_cfg_s *aq_nic_cfg = aq_nic_get_cfg(aq_nic);
> + const struct aq_hw_caps_s *hw_caps = aq_nic_cfg->aq_hw_caps;
> +
> + if (ring->rx_mini_pending || ring->rx_jumbo_pending) {
> + err = -EOPNOTSUPP;
> + goto err_exit;
> + }
> +
> + spin_lock(_nic->aq_spinlock);
> +
> + if (netif_running(ndev))
> + dev_close(ndev);

I don't think you can hold a spinlock around dev_close()/dev_open()
calls.

> + aq_nic_free_vectors(aq_nic);
> +
> + aq_nic_cfg->rxds = max(ring->rx_pending, hw_caps->rxds_min);
> + aq_nic_cfg->rxds = min(aq_nic_cfg->rxds, hw_caps->rxds_max);
> + aq_nic_cfg->rxds = ALIGN(aq_nic_cfg->rxds, AQ_HW_RXD_MULTIPLE);
> +
> + aq_nic_cfg->txds = max(ring->tx_pending, hw_caps->txds_min);
> + aq_nic_cfg->txds = min(aq_nic_cfg->txds, hw_caps->txds_max);
> + aq_nic_cfg->txds = ALIGN(aq_nic_cfg->txds, AQ_HW_TXD_MULTIPLE);
> +
> + for (aq_nic->aq_vecs = 0; aq_nic->aq_vecs < aq_nic_cfg->vecs;
> +  aq_nic->aq_vecs++) {
> + aq_nic->aq_vec[aq_nic->aq_vecs] =
> + aq_vec_alloc(aq_nic, aq_nic->aq_vecs, aq_nic_cfg);
> + if (unlikely(!aq_nic->aq_vec[aq_nic->aq_vecs])) {
> + err = -ENOMEM;
> + goto err_unlock;
> + }
> + }
> + if (!netif_running(ndev))
> + err = dev_open(ndev);

Will this not open the device regardless of whether it was open before?

> +err_unlock:
> + spin_unlock(_nic->aq_spinlock);
> +err_exit:
> + return err;
> +}


[PATCH bpf-next] bpf: Verify flags in bpf_fib_lookup

2018-05-29 Thread dsahern
From: David Ahern 

Verify flags argument contains only known flags. Allows programs to probe
for support as more are added.

Signed-off-by: David Ahern 
---
 net/core/filter.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 24e6ce8be567..4cff6d9cd724 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4270,6 +4270,9 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
if (plen < sizeof(*params))
return -EINVAL;
 
+   if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+   return -EINVAL;
+
switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
case AF_INET:
@@ -4304,6 +4307,9 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
if (plen < sizeof(*params))
return -EINVAL;
 
+   if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+   return -EINVAL;
+
switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
case AF_INET:
-- 
2.11.0



Re: [pull request][for-next 00/12] Mellanox, mlx5e updates 2018-05-25

2018-05-29 Thread Saeed Mahameed
On Tue, 2018-05-29 at 09:47 -0400, David Miller wrote:
> From: Saeed Mahameed 
> Date: Fri, 25 May 2018 17:01:55 -0700
> 
> > This is a mlx5e only pull request, for more information please see
> > tag
> > log below.
> > 
> > Please pull and let me know if there's any problem.
> 
> Pulled, thanks Saeed.
> 
> There was a minor conflict to resolve (simple overlapping changes).

Yes, sorry I didn't notify you about this in advance; the merge commit looks
OK, thanks a lot!



Re: [PATCH bpf-next] bpf: clean up eBPF helpers documentation

2018-05-29 Thread Song Liu
On Tue, May 29, 2018 at 4:27 AM, Quentin Monnet
 wrote:
> These are minor edits for the eBPF helpers documentation in
> include/uapi/linux/bpf.h.
>
> The main fix consists in removing "BPF_FIB_LOOKUP_", because it ends
> with a non-escaped underscore that gets interpreted by rst2man and
> produces the following message in the resulting manual page:
>
> DOCUTILS SYSTEM MESSAGES
>System Message: ERROR/3 (/tmp/bpf-helpers.rst:, line 1514)
>   Unknown target name: "bpf_fib_lookup".
>
> Other edits consist in:
>
> - Improving formatting for flag values for "bpf_fib_lookup()" helper.
> - Emphasising a parameter name in description of the return value for
>   "bpf_get_stack()" helper.
> - Removing unnecessary blank lines between "Description" and "Return"
>   sections for the few helpers that would use it, for consistency.
>
> Signed-off-by: Quentin Monnet 
> ---
>  include/uapi/linux/bpf.h | 21 ++---
>  1 file changed, 10 insertions(+), 11 deletions(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index cc68787f2d97..3f556b35ac8d 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1010,7 +1010,6 @@ union bpf_attr {
>   * ::
>   *
>   * # sysctl kernel.perf_event_max_stack=
> - *
>   * Return
>   * The positive or null stack id on success, or a negative error
>   * in case of failure.
> @@ -1821,10 +1820,9 @@ union bpf_attr {
>   * ::
>   *
>   * # sysctl kernel.perf_event_max_stack=
> - *
>   * Return
> - * a non-negative value equal to or less than size on success, or
> - * a negative error in case of failure.
> + * A non-negative value equal to or less than *size* on success,
> + * or a negative error in case of failure.
>   *
>   * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void 
> *to, u32 len, u32 start_header)
>   * Description
> @@ -1845,7 +1843,6 @@ union bpf_attr {
>   * in socket filters where *skb*\ **->data** does not always 
> point
>   * to the start of the mac header and where "direct packet 
> access"
>   * is not available.
> - *
>   * Return
>   * 0 on success, or a negative error in case of failure.
>   *
> @@ -1861,16 +1858,18 @@ union bpf_attr {
>   * rt_metric is set to metric from route.
>   *
>   * *plen* argument is the size of the passed in struct.
> - * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
> + * *flags* argument can be a combination of one or more of the
> + * following values:
>   *
> - * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
> - * full lookup using FIB rules
> - * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
> - * perspective (default is ingress)
> + * **BPF_FIB_LOOKUP_DIRECT**
> + * Do a direct table lookup vs full lookup using FIB
> + * rules.
> + * **BPF_FIB_LOOKUP_OUTPUT**
> + * Perform lookup from an egress perspective (default is
> + * ingress).
>   *
>   * *ctx* is either **struct xdp_md** for XDP programs or
>   * **struct sk_buff** tc cls_act programs.
> - *
>   * Return
>   * Egress device index on success, 0 if packet needs to continue
>   * up the stack for further processing or a negative error in 
> case
> --
> 2.14.1
>

Please also apply the same changes to tools/include/uapi/linux/bpf.h.

Other than this, it looks good to me.

Acked-by: Song Liu 

Thanks,
Song


Re: [PATCH bpf-next 03/11] bpf: fixup error message from gpl helpers on license mismatch

2018-05-29 Thread Song Liu
On Tue, May 29, 2018 at 10:16 AM, Jesper Dangaard Brouer
 wrote:
> On Mon, 28 May 2018 02:43:36 +0200
> Daniel Borkmann  wrote:
>
>> Stating 'proprietary program' in the error is just silly since it
>> can also be a different open source license than that which is just
>> not compatible.
>>
>> Reference: https://twitter.com/majek04/status/998531268039102465
>> Signed-off-by: Daniel Borkmann 
>> Acked-by: Alexei Starovoitov 
>
> Acked-by: Jesper Dangaard Brouer 
>
> Thank you for cleaning up this confusion :-)
>

Acked-by: Song Liu 

>> ---
>>  kernel/bpf/verifier.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 1fd9667b..4f4786e 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -2462,7 +2462,7 @@ static int check_helper_call(struct bpf_verifier_env 
>> *env, int func_id, int insn
>>
>>   /* eBPF programs must be GPL compatible to use GPL-ed functions */
>>   if (!env->prog->gpl_compatible && fn->gpl_only) {
>> - verbose(env, "cannot call GPL only function from proprietary 
>> program\n");
>> + verbose(env, "cannot call GPL-restricted function from non-GPL 
>> compatible program\n");
>>   return -EINVAL;
>>   }
>>
>
>
>
> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer


Re: [PATCH bpf-next 02/11] bpf: add also cbpf long jump test cases with heavy expansion

2018-05-29 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> We have one triggering on eBPF but lets also add a cBPF example to
> make sure we keep tracking them. Also add anther cBPF test running
> max number of MSH ops.
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 


> ---
>  lib/test_bpf.c | 63 
> ++
>  1 file changed, 63 insertions(+)
>
> diff --git a/lib/test_bpf.c b/lib/test_bpf.c
> index 317f231..60aedc8 100644
> --- a/lib/test_bpf.c
> +++ b/lib/test_bpf.c
> @@ -356,6 +356,52 @@ static int bpf_fill_maxinsns11(struct bpf_test *self)
> return __bpf_fill_ja(self, BPF_MAXINSNS, 68);
>  }
>
> +static int bpf_fill_maxinsns12(struct bpf_test *self)
> +{
> +   unsigned int len = BPF_MAXINSNS;
> +   struct sock_filter *insn;
> +   int i = 0;
> +
> +   insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
> +   if (!insn)
> +   return -ENOMEM;
> +
> +   insn[0] = __BPF_JUMP(BPF_JMP | BPF_JA, len - 2, 0, 0);
> +
> +   for (i = 1; i < len - 1; i++)
> +   insn[i] = __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0);
> +
> +   insn[len - 1] = __BPF_STMT(BPF_RET | BPF_K, 0xabababab);
> +
> +   self->u.ptr.insns = insn;
> +   self->u.ptr.len = len;
> +
> +   return 0;
> +}
> +
> +static int bpf_fill_maxinsns13(struct bpf_test *self)
> +{
> +   unsigned int len = BPF_MAXINSNS;
> +   struct sock_filter *insn;
> +   int i = 0;
> +
> +   insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
> +   if (!insn)
> +   return -ENOMEM;
> +
> +   for (i = 0; i < len - 3; i++)
> +   insn[i] = __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0);
> +
> +   insn[len - 3] = __BPF_STMT(BPF_LD | BPF_IMM, 0xabababab);
> +   insn[len - 2] = __BPF_STMT(BPF_ALU | BPF_XOR | BPF_X, 0);
> +   insn[len - 1] = __BPF_STMT(BPF_RET | BPF_A, 0);
> +
> +   self->u.ptr.insns = insn;
> +   self->u.ptr.len = len;
> +
> +   return 0;
> +}
> +
>  static int bpf_fill_ja(struct bpf_test *self)
>  {
> /* Hits exactly 11 passes on x86_64 JIT. */
> @@ -5290,6 +5336,23 @@ static struct bpf_test tests[] = {
> .expected_errcode = -ENOTSUPP,
> },
> {
> +   "BPF_MAXINSNS: jump over MSH",
> +   { },
> +   CLASSIC | FLAG_EXPECTED_FAIL,
> +   { 0xfa, 0xfb, 0xfc, 0xfd, },
> +   { { 4, 0xabababab } },
> +   .fill_helper = bpf_fill_maxinsns12,
> +   .expected_errcode = -EINVAL,
> +   },
> +   {
> +   "BPF_MAXINSNS: exec all MSH",
> +   { },
> +   CLASSIC,
> +   { 0xfa, 0xfb, 0xfc, 0xfd, },
> +   { { 4, 0xababab83 } },
> +   .fill_helper = bpf_fill_maxinsns13,
> +   },
> +   {
> "BPF_MAXINSNS: ld_abs+get_processor_id",
> { },
> CLASSIC,
> --
> 2.9.5
>


Re: [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs

2018-05-29 Thread Marcelo Ricardo Leitner
On Wed, May 30, 2018 at 01:45:08AM +0800, Xin Long wrote:
> If we're counting on max_t to fix this CPU stall, it should not
> matter that much if min rto < the value causing that stall.

Yes but putting a floor to rto_{min,max} now is to protect the rtx
timer now, not the heartbeat one.

> 
> >
> > Anyway, what about we add a floor to rto_max too, so that RTO can
> > actually grow into something bigger that don't hog the CPU? Like:
> > rto_min floor = 5ms
> > rto_max floor = 50ms
> >
> >   Marcelo
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


Re: [PATCH bpf-next 01/11] bpf: test case for map pointer poison with calls/branches

2018-05-29 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Add several test cases where the same or different map pointers
> originate from different paths in the program and execute a map
> lookup or tail call at a common location.
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

> ---
>  include/linux/filter.h  |  10 ++
>  tools/include/linux/filter.h|  10 ++
>  tools/testing/selftests/bpf/test_verifier.c | 185 
> 
>  3 files changed, 178 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d358d18..b443f70 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -289,6 +289,16 @@ struct xdp_buff;
> .off   = OFF,   \
> .imm   = 0 })
>
> +/* Relative call */
> +
> +#define BPF_CALL_REL(TGT)  \
> +   ((struct bpf_insn) {\
> +   .code  = BPF_JMP | BPF_CALL,\
> +   .dst_reg = 0,   \
> +   .src_reg = BPF_PSEUDO_CALL, \
> +   .off   = 0, \
> +   .imm   = TGT })
> +
>  /* Function call */
>
>  #define BPF_EMIT_CALL(FUNC)\
> diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
> index c5e512d..af55acf 100644
> --- a/tools/include/linux/filter.h
> +++ b/tools/include/linux/filter.h
> @@ -263,6 +263,16 @@
>  #define BPF_LD_MAP_FD(DST, MAP_FD) \
> BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
>
> +/* Relative call */
> +
> +#define BPF_CALL_REL(TGT)  \
> +   ((struct bpf_insn) {\
> +   .code  = BPF_JMP | BPF_CALL,\
> +   .dst_reg = 0,   \
> +   .src_reg = BPF_PSEUDO_CALL, \
> +   .off   = 0, \
> +   .imm   = TGT })
> +
>  /* Program exit */
>
>  #define BPF_EXIT_INSN()\
> diff --git a/tools/testing/selftests/bpf/test_verifier.c 
> b/tools/testing/selftests/bpf/test_verifier.c
> index 4b4f015..7cb1d74 100644
> --- a/tools/testing/selftests/bpf/test_verifier.c
> +++ b/tools/testing/selftests/bpf/test_verifier.c
> @@ -50,7 +50,7 @@
>
>  #define MAX_INSNS  BPF_MAXINSNS
>  #define MAX_FIXUPS 8
> -#define MAX_NR_MAPS4
> +#define MAX_NR_MAPS7
>  #define POINTER_VALUE  0xcafe4all
>  #define TEST_DATA_LEN  64
>
> @@ -66,7 +66,9 @@ struct bpf_test {
> int fixup_map1[MAX_FIXUPS];
> int fixup_map2[MAX_FIXUPS];
> int fixup_map3[MAX_FIXUPS];
> -   int fixup_prog[MAX_FIXUPS];
> +   int fixup_map4[MAX_FIXUPS];
> +   int fixup_prog1[MAX_FIXUPS];
> +   int fixup_prog2[MAX_FIXUPS];
> int fixup_map_in_map[MAX_FIXUPS];
> const char *errstr;
> const char *errstr_unpriv;
> @@ -2769,7 +2771,7 @@ static struct bpf_test tests[] = {
> BPF_MOV64_IMM(BPF_REG_0, 0),
> BPF_EXIT_INSN(),
> },
> -   .fixup_prog = { 1 },
> +   .fixup_prog1 = { 1 },
> .errstr_unpriv = "R3 leaks addr into helper",
> .result_unpriv = REJECT,
> .result = ACCEPT,
> @@ -2856,7 +2858,7 @@ static struct bpf_test tests[] = {
> BPF_MOV64_IMM(BPF_REG_0, 1),
> BPF_EXIT_INSN(),
> },
> -   .fixup_prog = { 1 },
> +   .fixup_prog1 = { 1 },
> .result = ACCEPT,
> .retval = 42,
> },
> @@ -2870,7 +2872,7 @@ static struct bpf_test tests[] = {
> BPF_MOV64_IMM(BPF_REG_0, 1),
> BPF_EXIT_INSN(),
> },
> -   .fixup_prog = { 1 },
> +   .fixup_prog1 = { 1 },
> .result = ACCEPT,
> .retval = 41,
> },
> @@ -2884,7 +2886,7 @@ static struct bpf_test tests[] = {
> BPF_MOV64_IMM(BPF_REG_0, 1),
> BPF_EXIT_INSN(),
> },
> -   .fixup_prog = { 1 },
> +   .fixup_prog1 = { 1 },
> .result = ACCEPT,
> .retval = 1,
> },
> @@ -2898,7 +2900,7 @@ static struct bpf_test tests[] = {
> BPF_MOV64_IMM(BPF_REG_0, 2),
> BPF_EXIT_INSN(),
> },
> -   .fixup_prog = { 1 },
> +   .fixup_prog1 = { 1 },
> .result = ACCEPT,
> .retval = 2,
> },
> @@ -2912,7 +2914,7 

[PATCH bpf-next] bpf: Drop mpls from bpf_fib_lookup

2018-05-29 Thread dsahern
From: David Ahern 

MPLS support will not be submitted this dev cycle, but in working on it
I do see a few changes are needed to the API. For now, drop mpls from the
API. Since the fields in question are unions, the mpls fields can be added
back later without affecting the uapi.

Signed-off-by: David Ahern 
---
 include/uapi/linux/bpf.h | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cc68787f2d97..2dd440e39802 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1855,10 +1855,10 @@ union bpf_attr {
  * If lookup is successful and result shows packet is to be
  * forwarded, the neighbor tables are searched for the nexthop.
  * If successful (ie., FIB lookup shows forwarding and nexthop
- * is resolved), the nexthop address is returned in ipv4_dst,
- * ipv6_dst or mpls_out based on family, smac is set to mac
- * address of egress device, dmac is set to nexthop mac address,
- * rt_metric is set to metric from route.
+ * is resolved), the nexthop address is returned in ipv4_dst
+ * or ipv6_dst based on family, smac is set to mac address of
+ * egress device, dmac is set to nexthop mac address, rt_metric
+ * is set to metric from route (IPv4/IPv6 only).
  *
  * *plen* argument is the size of the passed in struct.
  * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
@@ -2538,8 +2538,10 @@ struct bpf_raw_tracepoint_args {
 #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
 
 struct bpf_fib_lookup {
-   /* input */
-   __u8family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+   /* input:  network family for lookup (AF_INET, AF_INET6)
+* output: network family of egress nexthop
+*/
+   __u8family;
 
/* set if lookup is to consider L4 data - e.g., FIB rules */
__u8l4_protocol;
@@ -2555,22 +2557,20 @@ struct bpf_fib_lookup {
__u8tos;/* AF_INET  */
__be32  flowlabel;  /* AF_INET6 */
 
-   /* output: metric of fib result */
-   __u32 rt_metric;
+   /* output: metric of fib result (IPv4/IPv6 only) */
+   __u32   rt_metric;
};
 
union {
-   __be32  mpls_in;
__be32  ipv4_src;
__u32   ipv6_src[4];  /* in6_addr; network order */
};
 
-   /* input to bpf_fib_lookup, *dst is destination address.
-* output: bpf_fib_lookup sets to gateway address
+   /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in
+* network header. output: bpf_fib_lookup sets to gateway address
+* if FIB lookup returns gateway route
 */
union {
-   /* return for MPLS lookups */
-   __be32  mpls_out[4];  /* support up to 4 labels */
__be32  ipv4_dst;
__u32   ipv6_dst[4];  /* in6_addr; network order */
};
-- 
2.11.0



Re: [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs

2018-05-29 Thread Xin Long
On Wed, May 30, 2018 at 1:06 AM, Marcelo Ricardo Leitner
 wrote:
> On Tue, May 29, 2018 at 12:03:46PM -0400, Neal Cardwell wrote:
>> On Tue, May 29, 2018 at 11:45 AM Marcelo Ricardo Leitner <
>> marcelo.leit...@gmail.com> wrote:
>> > - patch2 - fix rtx attack vector
>> >- Add the floor value to rto_min to HZ/20 (which fits the values
>> >  that Michael shared on the other email)
>>
>> I would encourage allowing minimum RTO values down to 5ms, if the ACK
>> policy in the receiver makes this feasible. Our experience is that in
>> datacenter environments it can be advantageous to allow timer-based loss
>> recoveries using timeout values as low as 5ms, e.g.:
>
Thanks Neal. On Xin's tests, the heartbeat timer becomes an issue at
> ~25ms already. Xin, can you share more details on the hw, which CPU
> was used?
It was on a KVM guest,  "-smp 2,cores=1,threads=1,sockets=2"
# lscpu
Architecture:  x86_64
CPU op-mode(s):32-bit, 64-bit
Byte Order:Little Endian
CPU(s):2
On-line CPU(s) list:   0,1
Thread(s) per core:1
Core(s) per socket:1
Socket(s): 2
NUMA node(s):  1
Vendor ID: GenuineIntel
CPU family:6
Model: 13
Model name:QEMU Virtual CPU version 1.5.3
Stepping:  3
CPU MHz:   2397.222
BogoMIPS:  4794.44
Hypervisor vendor: KVM
Virtualization type:   full
L1d cache: 32K
L1i cache: 32K
L2 cache:  4096K
NUMA node0 CPU(s): 0,1
Flags: fpu de pse tsc msr pae mce cx8 apic sep mtrr
pge mca cmov pse36 clflush mmx fxsr sse sse2 syscall nx lm rep_good
nopl cpuid pni cx16 hypervisor lahf_lm abm pti

If we're counting on max_t to fix this CPU stall, it should not
matter much whether min rto < the value causing that stall.

>
> Anyway, what about we add a floor to rto_max too, so that RTO can
> actually grow into something bigger that don't hog the CPU? Like:
> rto_min floor = 5ms
> rto_max floor = 50ms
>
>   Marcelo


Re: [PATCH bpf-next 04/11] bpf: show prog and map id in fdinfo

2018-05-29 Thread Jesper Dangaard Brouer
On Mon, 28 May 2018 02:43:37 +0200
Daniel Borkmann  wrote:

> Its trivial and straight forward to expose it for scripts that can
> then use it along with bpftool in order to inspect an individual
> application's used maps and progs. Right now we dump some basic
> information in the fdinfo file but with the help of the map/prog
> id full introspection becomes possible now.
> 
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 
> ---

AFAICR iproute uses this proc fdinfo, for pinned maps.  Have you tested
if this change is handled gracefully by tc ?

>  kernel/bpf/syscall.c | 12 
>  1 file changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 388d4fe..79341e8 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -326,13 +326,15 @@ static void bpf_map_show_fdinfo(struct seq_file *m, 
> struct file *filp)
>  "value_size:\t%u\n"
>  "max_entries:\t%u\n"
>  "map_flags:\t%#x\n"
> -"memlock:\t%llu\n",
> +"memlock:\t%llu\n"
> +"map_id:\t%u\n",
>  map->map_type,
>  map->key_size,
>  map->value_size,
>  map->max_entries,
>  map->map_flags,
> -map->pages * 1ULL << PAGE_SHIFT);
> +map->pages * 1ULL << PAGE_SHIFT,
> +map->id);
>  
>   if (owner_prog_type) {
>   seq_printf(m, "owner_prog_type:\t%u\n",
> @@ -1069,11 +1071,13 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, 
> struct file *filp)
>  "prog_type:\t%u\n"
>  "prog_jited:\t%u\n"
>  "prog_tag:\t%s\n"
> -"memlock:\t%llu\n",
> +"memlock:\t%llu\n"
> +"prog_id:\t%u\n",
>  prog->type,
>  prog->jited,
>  prog_tag,
> -prog->pages * 1ULL << PAGE_SHIFT);
> +prog->pages * 1ULL << PAGE_SHIFT,
> +prog->aux->id);
>  }
>  #endif
>  



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: [PATCH bpf-next 05/11] bpf: avoid retpoline for lookup/update/delete calls on maps

2018-05-29 Thread Jesper Dangaard Brouer
On Mon, 28 May 2018 02:43:38 +0200
Daniel Borkmann  wrote:

> While some of the BPF map lookup helpers provide a ->map_gen_lookup()
> callback for inlining the map lookup altogether it is not available
> for every map, so the remaining ones have to call bpf_map_lookup_elem()
> helper which does a dispatch to map->ops->map_lookup_elem(). In
> times of retpolines, this will control and trap speculative execution
> rather than letting it do its work for the indirect call and will
> therefore cause a slowdown. Likewise, bpf_map_update_elem() and
> bpf_map_delete_elem() do not have an inlined version and need to call
> into their map->ops->map_update_elem() resp. map->ops->map_delete_elem()
> handlers.
> 
> Before:
> 
>   # bpftool p d x i 1

I would really appreciate if we can use the long options in these kind
of examples.  It makes the command "self-documenting" and searchable by
google.

Here it would be:

 # bpftool prog dump xlated id 1

> 0: (bf) r2 = r10
> 1: (07) r2 += -8
> 2: (7a) *(u64 *)(r2 +0) = 0
> 3: (18) r1 = map[id:1]
> 5: (85) call __htab_map_lookup_elem#232656
> 6: (15) if r0 == 0x0 goto pc+4
> 7: (71) r1 = *(u8 *)(r0 +35)
> 8: (55) if r1 != 0x0 goto pc+1
> 9: (72) *(u8 *)(r0 +35) = 1
>10: (07) r0 += 56
>11: (15) if r0 == 0x0 goto pc+4
>12: (bf) r2 = r0
>13: (18) r1 = map[id:1]
>15: (85) call bpf_map_delete_elem#215008  <-- indirect call via
>16: (95) exit helper
> 
> After:
> 
>   # bpftool p d x i 1

Same here

> 0: (bf) r2 = r10
> 1: (07) r2 += -8
> 2: (7a) *(u64 *)(r2 +0) = 0
> 3: (18) r1 = map[id:1]
> 5: (85) call __htab_map_lookup_elem#233328
> 6: (15) if r0 == 0x0 goto pc+4
> 7: (71) r1 = *(u8 *)(r0 +35)
> 8: (55) if r1 != 0x0 goto pc+1
> 9: (72) *(u8 *)(r0 +35) = 1
>10: (07) r0 += 56
>11: (15) if r0 == 0x0 goto pc+4
>12: (bf) r2 = r0
>13: (18) r1 = map[id:1]
>15: (85) call htab_lru_map_delete_elem#238240  <-- direct call
>16: (95) exit
> 


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: [PATCH bpf-next 03/11] bpf: fixup error message from gpl helpers on license mismatch

2018-05-29 Thread Jesper Dangaard Brouer
On Mon, 28 May 2018 02:43:36 +0200
Daniel Borkmann  wrote:

> Stating 'proprietary program' in the error is just silly since it
> can also be a different open source license than that which is just
> not compatible.
> 
> Reference: https://twitter.com/majek04/status/998531268039102465
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Jesper Dangaard Brouer 

Thank you for cleaning up this confusion :-)

> ---
>  kernel/bpf/verifier.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 1fd9667b..4f4786e 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -2462,7 +2462,7 @@ static int check_helper_call(struct bpf_verifier_env 
> *env, int func_id, int insn
>  
>   /* eBPF programs must be GPL compatible to use GPL-ed functions */
>   if (!env->prog->gpl_compatible && fn->gpl_only) {
> - verbose(env, "cannot call GPL only function from proprietary 
> program\n");
> + verbose(env, "cannot call GPL-restricted function from non-GPL 
> compatible program\n");
>   return -EINVAL;
>   }
>  



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs

2018-05-29 Thread Marcelo Ricardo Leitner
On Tue, May 29, 2018 at 12:03:46PM -0400, Neal Cardwell wrote:
> On Tue, May 29, 2018 at 11:45 AM Marcelo Ricardo Leitner <
> marcelo.leit...@gmail.com> wrote:
> > - patch2 - fix rtx attack vector
> >- Add the floor value to rto_min to HZ/20 (which fits the values
> >  that Michael shared on the other email)
> 
> I would encourage allowing minimum RTO values down to 5ms, if the ACK
> policy in the receiver makes this feasible. Our experience is that in
> datacenter environments it can be advantageous to allow timer-based loss
> recoveries using timeout values as low as 5ms, e.g.:

Thanks Neal. On Xin's tests, the heartbeat timer becomes an issue at
~25ms already. Xin, can you share more details on the hw, which CPU
was used?

Anyway, what about we add a floor to rto_max too, so that RTO can
actually grow into something bigger that don't hog the CPU? Like:
rto_min floor = 5ms
rto_max floor = 50ms

  Marcelo


[PATCH v2 net-next] net: remove bypassed check in sch_direct_xmit()

2018-05-29 Thread Song Liu
Checking netif_xmit_frozen_or_stopped() at the end of sch_direct_xmit()
is being bypassed. This is because "ret" from sch_direct_xmit() will be
either NETDEV_TX_OK or NETDEV_TX_BUSY, and only ret == NETDEV_TX_OK == 0
will reach the condition:

if (ret && netif_xmit_frozen_or_stopped(txq))
return false;

This patch cleans up the code by removing the whole condition.

For more discussion about this, please refer to
   https://marc.info/?t=15272719578

Signed-off-by: Song Liu 
Cc: John Fastabend 
Cc: Alexei Starovoitov 
Cc: David S. Miller 
---
 net/sched/sch_generic.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 760ab1b..69078c8 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -346,9 +346,6 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
return false;
}
 
-   if (ret && netif_xmit_frozen_or_stopped(txq))
-   return false;
-
return true;
 }
 
-- 
2.9.5



Re: [PATCH net-next] net: remove bypassed check in sch_direct_xmit()

2018-05-29 Thread Song Liu



> On May 29, 2018, at 1:58 AM, Sergei Shtylyov 
>  wrote:
> 
> Hello!
> 
> On 5/29/2018 12:36 AM, Song Liu wrote:
> 
>> Check sch_direct_xmit() at the end of sch_direct_xmit() will be bypassed.
> 
>   "Checking netif_xmit_frozen_or_stopped()", perhaps? Else it doesn't make 
> much sense...

Thanks Sergei!

Sending v2 with fix. 

Song

> 
>> This is because "ret" from sch_direct_xmit() will be either NETDEV_TX_OK
>> or NETDEV_TX_BUSY, and only ret == NETDEV_TX_OK == 0 will reach the
>> condition:
>> if (ret && netif_xmit_frozen_or_stopped(txq))
>> return false;
>> This patch cleans up the code by removing  the whole condition.
>> For more discussion about this, please refer to
>>https://marc.info/?t=15272719578
>> Signed-off-by: Song Liu 
>> Cc: John Fastabend 
>> Cc: Alexei Starovoitov 
>> Cc: David S. Miller 
> [...]
> 
> MBR, Sergei




Re: [PATCH net] net: sched: check netif_xmit_frozen_or_stopped() in sch_direct_xmit()

2018-05-29 Thread Song Liu



> On May 29, 2018, at 7:02 AM, David Miller  wrote:
> 
> From: Song Liu 
> Date: Fri, 25 May 2018 11:11:44 -0700
> 
>> Summary:
>> 
>> At the end of sch_direct_xmit(), we are in the else path of
>> !dev_xmit_complete(ret), which means ret == NETDEV_TX_OK. The following
>> condition will always fail and netif_xmit_frozen_or_stopped() is not
>> checked at all.
>> 
>>if (ret && netif_xmit_frozen_or_stopped(txq))
>> return false;
>> 
>> In this patch, this condition is fixed as:
>> 
>>if (netif_xmit_frozen_or_stopped(txq))
>> return false;
>> 
>> and further simplifies the code as:
>> 
>>return !netif_xmit_frozen_or_stopped(txq);
>> 
>> Fixes: 29b86cdac00a ("net: sched: remove remaining uses for qdisc_qlen in 
>> xmit path")
>> Cc: John Fastabend 
>> Cc: David S. Miller 
>> Signed-off-by: Song Liu 
> 
> I expect a new version of this patch which removes the test entirely.

The new version of it is here: http://patchwork.ozlabs.org/patch/921708/

Thanks,
Song



Re: [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs

2018-05-29 Thread Neal Cardwell
On Tue, May 29, 2018 at 11:45 AM Marcelo Ricardo Leitner <
marcelo.leit...@gmail.com> wrote:
> - patch2 - fix rtx attack vector
>- Add the floor value to rto_min to HZ/20 (which fits the values
>  that Michael shared on the other email)

I would encourage allowing minimum RTO values down to 5ms, if the ACK
policy in the receiver makes this feasible. Our experience is that in
datacenter environments it can be advantageous to allow timer-based loss
recoveries using timeout values as low as 5ms, e.g.:


https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf
   https://tools.ietf.org/html/draft-wang-tcpm-low-latency-opt-00

cheers,
neal


Re: [PATCH bpf v2 0/5] fix test_sockmap

2018-05-29 Thread John Fastabend
On 05/27/2018 09:37 PM, Prashant Bhole wrote:
> This series fixes error handling, timeout and data verification in
> test_sockmap. Previously it was not able to detect failure/timeout in
> RX/TX thread because error was not notified to the main thread.
> 
> Also slightly improved test output by printing parameter values (cork,
> apply, start, end) so that parameters for all tests are displayed.
> 
> Prashant Bhole (5):
>   selftests/bpf: test_sockmap, check test failure
>   selftests/bpf: test_sockmap, join cgroup in selftest mode
>   selftests/bpf: test_sockmap, fix test timeout
>   selftests/bpf: test_sockmap, fix data verification
>   selftests/bpf: test_sockmap, print additional test options
> 
>  tools/testing/selftests/bpf/test_sockmap.c | 76 +-
>  1 file changed, 58 insertions(+), 18 deletions(-)
> 

After first patch "check test failure" how do we handle the case
where test is known to cause timeouts because we are specifically testing
these cases. This is the 'cork' parameter we discussed in the last
series. It looks like with this series the test may still throw an
error?

Thanks,
John


Re: [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs

2018-05-29 Thread Marcelo Ricardo Leitner
On Tue, May 29, 2018 at 03:06:06PM +0200, Michael Tuexen wrote:
> > On 29. May 2018, at 13:41, Neil Horman  wrote:
> > 
> > On Mon, May 28, 2018 at 04:43:15PM -0300, Marcelo Ricardo Leitner wrote:
> >> On Sat, May 26, 2018 at 09:01:00PM -0400, Neil Horman wrote:
> >>> On Sat, May 26, 2018 at 05:50:39PM +0200, Dmitry Vyukov wrote:
>  On Sat, May 26, 2018 at 5:42 PM, Michael Tuexen
>   wrote:
> >> On 25. May 2018, at 21:13, Neil Horman  wrote:
> >> 
> >> On Sat, May 26, 2018 at 01:41:02AM +0800, Xin Long wrote:
> >>> syzbot reported a rcu_sched self-detected stall on CPU which is caused
> >>> by too small value set on rto_min with SCTP_RTOINFO sockopt. With this
> >>> value, hb_timer will get stuck there, as in its timer handler it 
> >>> starts
> >>> this timer again with this value, then goes to the timer handler 
> >>> again.
> >>> 
> >>> This problem is there since very beginning, and thanks to Eric for the
> >>> reproducer shared from a syzbot mail.
> >>> 
> >>> This patch fixes it by not allowing to set rto_min with a value below
> >>> 200 msecs, which is based on TCP's, by either setsockopt or sysctl.
> >>> 
> >>> Reported-by: syzbot+3dcd59a1f907245f8...@syzkaller.appspotmail.com
> >>> Suggested-by: Marcelo Ricardo Leitner 
> >>> Signed-off-by: Xin Long 
> >>> ---
> >>> include/net/sctp/constants.h |  1 +
> >>> net/sctp/socket.c| 10 +++---
> >>> net/sctp/sysctl.c|  3 ++-
> >>> 3 files changed, 10 insertions(+), 4 deletions(-)
> >>> 
> >>> diff --git a/include/net/sctp/constants.h 
> >>> b/include/net/sctp/constants.h
> >>> index 20ff237..2ee7a7b 100644
> >>> --- a/include/net/sctp/constants.h
> >>> +++ b/include/net/sctp/constants.h
> >>> @@ -277,6 +277,7 @@ enum { SCTP_MAX_GABS = 16 };
> >>> #define SCTP_RTO_INITIAL (3 * 1000)
> >>> #define SCTP_RTO_MIN (1 * 1000)
> >>> #define SCTP_RTO_MAX (60 * 1000)
> >>> +#define SCTP_RTO_HARD_MIN   200
> >>> 
> >>> #define SCTP_RTO_ALPHA  3   /* 1/8 when converted to right 
> >>> shifts. */
> >>> #define SCTP_RTO_BETA   2   /* 1/4 when converted to right 
> >>> shifts. */
> >>> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> >>> index ae7e7c6..6ef12c7 100644
> >>> --- a/net/sctp/socket.c
> >>> +++ b/net/sctp/socket.c
> >>> @@ -3029,7 +3029,8 @@ static int sctp_setsockopt_nodelay(struct sock 
> >>> *sk, char __user *optval,
> >>> * be changed.
> >>> *
> >>> */
> >>> -static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user 
> >>> *optval, unsigned int optlen)
> >>> +static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user 
> >>> *optval,
> >>> +   unsigned int optlen)
> >>> {
> >>> struct sctp_rtoinfo rtoinfo;
> >>> struct sctp_association *asoc;
> >>> @@ -3056,10 +3057,13 @@ static int sctp_setsockopt_rtoinfo(struct 
> >>> sock *sk, char __user *optval, unsigne
> >>> else
> >>> rto_max = asoc ? asoc->rto_max : sp->rtoinfo.srto_max;
> >>> 
> >>> -if (rto_min)
> >>> +if (rto_min) {
> >>> +if (rto_min < SCTP_RTO_HARD_MIN)
> >>> +return -EINVAL;
> >>> rto_min = asoc ? msecs_to_jiffies(rto_min) : rto_min;
> >>> -else
> >>> +} else {
> >>> rto_min = asoc ? asoc->rto_min : sp->rtoinfo.srto_min;
> >>> +}
> >>> 
> >>> if (rto_min > rto_max)
> >>> return -EINVAL;
> >>> diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> >>> index 33ca5b7..7ec854a 100644
> >>> --- a/net/sctp/sysctl.c
> >>> +++ b/net/sctp/sysctl.c
> >>> @@ -52,6 +52,7 @@ static int rto_alpha_min = 0;
> >>> static int rto_beta_min = 0;
> >>> static int rto_alpha_max = 1000;
> >>> static int rto_beta_max = 1000;
> >>> +static int rto_hard_min = SCTP_RTO_HARD_MIN;
> >>> 
> >>> static unsigned long max_autoclose_min = 0;
> >>> static unsigned long max_autoclose_max =
> >>> @@ -116,7 +117,7 @@ static struct ctl_table sctp_net_table[] = {
> >>> .maxlen = sizeof(unsigned int),
> >>> .mode   = 0644,
> >>> .proc_handler   = proc_sctp_do_rto_min,
> >>> -.extra1 = ,
> >>> +.extra1 = _hard_min,
> >>> .extra2 = _net.sctp.rto_max
> >>> },
> >>> {
> >>> --
> >>> 2.1.0
> >>> 
> >>> --
> >>> To unsubscribe from this list: send the line "unsubscribe linux-sctp" 
> >>> in
> >>> the body of a message to majord...@vger.kernel.org
> >>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>> 
> >> Patch looks fine, you probably want to note this hard 

Re: STMMAC driver with TSO enabled issue

2018-05-29 Thread Bhadram Varka

Hi Jose,

On 5/28/2018 4:26 PM, Jose Abreu wrote:

Hi Bhadram,

On 28-05-2018 10:15, Bhadram Varka wrote:

Hi Jose,

On 5/25/2018 8:02 PM, Jose Abreu wrote:

On 25-05-2018 15:25, Bhadram Varka wrote:

Hi Jose,

On 5/25/2018 7:35 PM, Jose Abreu wrote:

Hi Bhadram,

On 25-05-2018 05:41, Bhadram Varka wrote:

Hi Jose,

On 5/24/2018 3:01 PM, Jose Abreu wrote:

Hi Bhadram,

On 24-05-2018 06:58, Bhadram Varka wrote:


After some time if check Tx descriptor status - then I see
only
below

[..]
[85788.286730] 027 [0x827951b0]: 0xf854f000 0x0 0x16d8
0x9000

index 025 and 026 descriptors processed but not index 027.

At this stage Tx DMA is always in below state -

■ 3'b011: Running (Reading Data from system memory
buffer and queuing it to the Tx buffer (Tx FIFO))


Thats strange, I think the descriptors look okay though. I
will
need the registers values (before the lock) and, if
possible, the
git bisect output.


Attaching the register dump file after the issue observed.
Please check once.



->8-
0x112c = 0x003F
0x11ac = 0x003F
0x122c = 0x003F
0x12ac = 0x003F

0x1130 = 0x003F
0x11b0 = 0x003F
0x1230 = 0x003F
0x12b0 = 0x003F
->8-

This can't be right, it should be DMA_{RX/TX}_SIZE - 1 =
511. Did
you change these values in the code?



Yes. I have changed the descriptor length to 64 - so that
searching for the current descriptor status would be easy.


Ok, it shouldn't impact anything. The only thing I'm remembering
now is that you can have TSO not enabled in all DMA channels (HW
configuration allows this). Please check if TSO in single-queue
works.

TSO works fine if only single queue enabled. I don't see any
limitation from HW side because TSO works fine with other
driver which we received from Synopsys with IP drop.


You need to check with HW team if TSO is enabled for all channels
because you can have TSO channels < DMA channels and there is no
way to confirm this in the registers. Also check if received
driver is routing packets to queue != 0.



Root caused the issue to the TxPBL settings. In the current configuration the 
driver is using TxPBL = 32, which is fine for a single channel but is not 
the recommended setting for a multi-queue scenario. The recommended setting 
for TxPBL is half of the queue size.


o Total MTL Tx queue size - 16KB
o For multi-queue - total size divided by number of queues -
(16KB/4) = 4KB for each queue.
o So we need to set the TxPBL value so that we can place a memory request 
for 2KB from system memory. To achieve this we need to set TxPBL=16.


Thanks for the help in debugging.

--
Thanks,
Bhadram.


Re: [PATCH bpf-next 06/11] bpf: add bpf_skb_cgroup_id helper

2018-05-29 Thread Daniel Borkmann
On 05/29/2018 02:15 PM, Quentin Monnet wrote:
> Hi Daniel,
> 
> 2018-05-28 02:43 UTC+0200 ~ Daniel Borkmann 
>> Add a new bpf_skb_cgroup_id() helper that allows to retrieve the
>> cgroup id from the skb's socket. This is useful in particular to
>> enable bpf_get_cgroup_classid()-like behavior for cgroup v1 in
>> cgroup v2 by allowing ID based matching on egress. This can in
>> particular be used in combination with applying policy e.g. from
>> map lookups, and also complements the older bpf_skb_under_cgroup()
>> interface. In user space the cgroup id for a given path can be
>> retrieved through the f_handle as demonstrated in [0] recently.
>>
>>   [0] https://lkml.org/lkml/2018/5/22/1190
>>
>> Signed-off-by: Daniel Borkmann 
>> Acked-by: Alexei Starovoitov 
>> ---
>>  include/uapi/linux/bpf.h | 17 -
>>  net/core/filter.c| 29 +++--
>>  2 files changed, 43 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 9b8c6e3..e2853aa 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -2004,6 +2004,20 @@ union bpf_attr {
>>   *  direct packet access.
>>   *  Return
>>   *  0 on success, or a negative error in case of failure.
>> + *
>> + * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
>> + *  Description
>> + *  Return the cgroup v2 id of the socket associated with the *skb*.
>> + *  This is roughly similar to the **bpf_get_cgroup_classid**\ ()
>> + *  helper for cgroup v1 by providing a tag resp. identifier that
>> + *  can be matched on or used for map lookups e.g. to implement
>> + *  policy. The cgroup v2 id of a given path in the hierarchy is
>> + *  exposed in user space through the f_handle API in order to get
>> + *  to the same 64-bit id.
>> + *
>> + *  This helper can be used on TC egress path, but not on ingress.
> 
> Nitpick: Maybe mention that the kernel must be built with
> CONFIG_SOCK_CGROUP_DATA option for the helper to be available?

Yeah that's fine. I was planning on a minor respin anyway some time today,
so I'll also update the description along with it.

Cheers,
Daniel


Re: [PATCH bpf v2 1/5] selftests/bpf: test_sockmap, check test failure

2018-05-29 Thread John Fastabend
On 05/27/2018 09:37 PM, Prashant Bhole wrote:
> Test failures are not identified because exit code of RX/TX threads
> is not checked. Also threads are not returning correct exit code.
> 
> - Return exit code from threads depending on test execution status
> - In main thread, check the exit code of RX/TX threads
> 
> Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
> Signed-off-by: Prashant Bhole 
> ---

Acked-by: John Fastabend 

>  tools/testing/selftests/bpf/test_sockmap.c | 25 --
>  1 file changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/test_sockmap.c 
> b/tools/testing/selftests/bpf/test_sockmap.c
> index eb17fae458e6..34feb74c95c4 100644
> --- a/tools/testing/selftests/bpf/test_sockmap.c
> +++ b/tools/testing/selftests/bpf/test_sockmap.c
> @@ -429,8 +429,8 @@ static int sendmsg_test(struct sockmap_options *opt)
>   struct msg_stats s = {0};
>   int iov_count = opt->iov_count;
>   int iov_buf = opt->iov_length;
> + int rx_status, tx_status;
>   int cnt = opt->rate;
> - int status;
>  
>   errno = 0;
>  
> @@ -442,7 +442,7 @@ static int sendmsg_test(struct sockmap_options *opt)
>   rxpid = fork();
>   if (rxpid == 0) {
>   if (opt->drop_expected)
> - exit(1);
> + exit(0);
>  
>   if (opt->sendpage)
>   iov_count = 1;
> @@ -463,7 +463,7 @@ static int sendmsg_test(struct sockmap_options *opt)
>   "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB 
> %fB/s %fGB/s\n",
>   s.bytes_sent, sent_Bps, sent_Bps/giga,
>   s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
> - exit(1);
> + exit(err ? 1 : 0);
>   } else if (rxpid == -1) {
>   perror("msg_loop_rx: ");
>   return errno;
> @@ -491,14 +491,27 @@ static int sendmsg_test(struct sockmap_options *opt)
>   "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB 
> %fB/s %fGB/s\n",
>   s.bytes_sent, sent_Bps, sent_Bps/giga,
>   s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
> - exit(1);
> + exit(err ? 1 : 0);
>   } else if (txpid == -1) {
>   perror("msg_loop_tx: ");
>   return errno;
>   }
>  
> - assert(waitpid(rxpid, , 0) == rxpid);
> - assert(waitpid(txpid, , 0) == txpid);
> + assert(waitpid(rxpid, _status, 0) == rxpid);
> + assert(waitpid(txpid, _status, 0) == txpid);
> + if (WIFEXITED(rx_status)) {
> + err = WEXITSTATUS(rx_status);
> + if (err) {
> + fprintf(stderr, "rx thread exited with err %d. ", err);
> + goto out;
> + }
> + }
> + if (WIFEXITED(tx_status)) {
> + err = WEXITSTATUS(tx_status);
> + if (err)
> + fprintf(stderr, "tx thread exited with err %d. ", err);
> + }
> +out:
>   return err;
>  }
>  
> 



Re: [PATCH v2 net] tun: Fix NULL pointer dereference in XDP redirect

2018-05-29 Thread David Miller
From: Toshiaki Makita 
Date: Mon, 28 May 2018 19:37:49 +0900

> Calling XDP redirection requires bh disabled. Softirq can call another
> XDP function and redirection functions, then the percpu static variable
> ri->map can be overwritten to NULL.
 ...
> v2:
>  - Removed preempt_disable/enable since local_bh_disable will prevent
>preemption as well, feedback from Jason Wang.
> 
> Fixes: 761876c857cb ("tap: XDP support")
> Signed-off-by: Toshiaki Makita 

Applied and queued up for -stable.


Re: [PATCH net] be2net: Fix error detection logic for BE3

2018-05-29 Thread David Miller
From: Suresh Reddy 
Date: Mon, 28 May 2018 01:26:06 -0400

> Check for 0xE00 (RECOVERABLE_ERR) along with ARMFW UE (0x0)
> in be_detect_error() to know whether the error is valid error or not
> 
> Fixes: 673c96e5a ("be2net: Fix UE detection logic for BE3")
> Signed-off-by: Suresh Reddy 

Applied and queued up for -stable.


[PATCH iproute2 v2] ipaddress: strengthen check on 'label' input

2018-05-29 Thread Patrick Talbert
As mentioned in the ip-address man page, an address label must
be equal to the device name or prefixed by the device name
followed by a colon. Currently the only check on this input is
to see if the device name appears at the beginning of the label
string.

This commit adds an additional check to ensure label == dev or
continues with a colon.

Signed-off-by: Patrick Talbert 
Suggested-by: Stephen Hemminger 
---
 ip/ipaddress.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 00da14c..fce2008 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -2040,6 +2040,22 @@ static bool ipaddr_is_multicast(inet_prefix *a)
return false;
 }
 
+static bool is_valid_label(const char *dev, const char *label)
+{
+   char alias[strlen(dev) + 1];
+
+   if (strlen(label) < strlen(dev))
+   return false;
+
+   strcpy(alias, dev);
+   strcat(alias, ":");
+   if (strncmp(label, dev, strlen(dev)) == 0 ||
+   strncmp(label, alias, strlen(alias)) == 0)
+   return true;
+   else
+   return false;
+}
+
 static int ipaddr_modify(int cmd, int flags, int argc, char **argv)
 {
struct {
@@ -2174,8 +2190,9 @@ static int ipaddr_modify(int cmd, int flags, int argc, 
char **argv)
fprintf(stderr, "Not enough information: \"dev\" argument is 
required.\n");
return -1;
}
-   if (l && matches(d, l) != 0) {
-   fprintf(stderr, "\"dev\" (%s) must match \"label\" (%s).\n", d, 
l);
+   if (l && ! is_valid_label(d, l)) {
+   fprintf(stderr, "\"label\" (%s) must match \"dev\" (%s) or be 
prefixed by"
+   " \"dev\" with a colon.\n", l, d);
return -1;
}
 
-- 
1.8.3.1



Re: [PATCH] PCI: allow drivers to limit the number of VFs to 0

2018-05-29 Thread Don Dutile

On 05/25/2018 05:05 PM, Jakub Kicinski wrote:

On Fri, 25 May 2018 09:02:23 -0500, Bjorn Helgaas wrote:

On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:

On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:

On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:

Some user space depends on enabling sriov_totalvfs number of VFs
to not fail, e.g.:

$ cat .../sriov_totalvfs > .../sriov_numvfs

For devices which VF support depends on loaded FW we have the
pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
a special "unset" value, meaning drivers can't limit sriov_totalvfs
to 0.  Remove the special values completely and simply initialize
driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
Add a helper for drivers to reset the VF limit back to total.


I still can't really make sense out of the changelog.

I think part of the reason it's confusing is because there are two
things going on:

   1) You want this:
   
pci_sriov_set_totalvfs(dev, 0);

x = pci_sriov_get_totalvfs(dev)

  to return 0 instead of total_VFs.  That seems to connect with
  your subject line.  It means "sriov_totalvfs" in sysfs could be
  0, but I don't know how that is useful (I'm sure it is; just
  educate me :))


Let me just quote the bug report that got filed on our internal bug
tracker :)

   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
   then tries to set that as the sriov_numvfs parameter.

   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0,
   but it's set to max.  When FW is switched to flower*, the correct
   sriov_totalvfs value is presented.

* flower is a project name


 From the point of view of the PCI core (which knows nothing about
device firmware and relies on the architected config space described
by the PCIe spec), this sounds like an erratum: with some firmware
installed, the device is not capable of SR-IOV, but still advertises
an SR-IOV capability with "TotalVFs > 0".

Regardless of whether that's an erratum, we do allow PF drivers to use
pci_sriov_set_totalvfs() to limit the number of VFs that may be
enabled by writing to the PF's "sriov_numvfs" sysfs file.


Think more of an FPGA which can be reprogrammed at runtime to have
different capabilities than an erratum.  Some FWs simply have no use
for VFs and save resources (and validation time) by not supporting it.


Sure, then the steps should be:
a) (re-)program FPGA
b) invoke hot-plug for new device.
   -- by default, VFs aren't configured(enabled) in a Linux kernel;
  -- some drivers provide boot-time enablement, but that becomes
 system-wide, and can cause major config issues when multiples of a 
device are installed in the system.
  -- otherwise, configure via sysfs
   -- this should clear/reset the VF values too.


But the current implementation does not allow a PF driver to limit VFs
to 0, and that does seem nonsensical.


My understanding is OpenStack uses sriov_totalvfs to determine how many
VFs can be enabled, looks like this is the code:

http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
   

   2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
  sure what you intend for this.  Is *every* driver supposed to
  call it in .remove()?  Could/should this be done in the core
  somehow instead of depending on every driver?


Good question, I was just thinking yesterday we may want to call it
from the core, but I don't think it's strictly necessary nor always
sufficient (we may reload FW without re-probing).

We have a device which supports different number of VFs based on the FW
loaded.  Some legacy FWs does not inform the driver how many VFs it can
support, because it supports max.  So the flow in our driver is this:

load_fw(dev);
...
max_vfs = ask_fw_for_max_vfs(dev);
if (max_vfs >= 0)
return pci_sriov_set_totalvfs(dev, max_vfs);
else /* FW didn't tell us, assume max */
return pci_sriov_reset_totalvfs(dev);

We also reset the max on device remove, but that's not strictly
necessary.

Other users of pci_sriov_set_totalvfs() always know the value to set
the total to (either always get it from FW or it's a constant).

If you prefer we can work out the correct max for those legacy cases in
the driver as well, although it seemed cleaner to just ask the core,
since it already has total_VFs value handy :)
   

I'm also having a hard time connecting your user-space command example
with the rest of this.  Maybe it will make more sense to me tomorrow
after some coffee.


OpenStack assumes it will always be able to set sriov_numvfs to
sriov_totalvfs, see this 'if':

http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512


Thanks for educating me.  I think there are two issues here that we
can separate.  I extracted the patch below for the first.

Re: [PATCH] PCI: allow drivers to limit the number of VFs to 0

2018-05-29 Thread Don Dutile

On 05/25/2018 04:46 PM, Bjorn Helgaas wrote:

On Fri, May 25, 2018 at 03:27:52PM -0400, Don Dutile wrote:

On 05/25/2018 10:02 AM, Bjorn Helgaas wrote:

On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:

Hi Bjorn!

On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:

On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:

Some user space depends on enabling sriov_totalvfs number of VFs
to not fail, e.g.:

$ cat .../sriov_totalvfs > .../sriov_numvfs

For devices which VF support depends on loaded FW we have the
pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
a special "unset" value, meaning drivers can't limit sriov_totalvfs
to 0.  Remove the special values completely and simply initialize
driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
Add a helper for drivers to reset the VF limit back to total.


I still can't really make sense out of the changelog.

I think part of the reason it's confusing is because there are two
things going on:

1) You want this:
 pci_sriov_set_totalvfs(dev, 0);
 x = pci_sriov_get_totalvfs(dev)

   to return 0 instead of total_VFs.  That seems to connect with
   your subject line.  It means "sriov_totalvfs" in sysfs could be
   0, but I don't know how that is useful (I'm sure it is; just
   educate me :))


Let me just quote the bug report that got filed on our internal bug
tracker :)

When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
then tries to set that as the sriov_numvfs parameter.

For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0,
but it's set to max.  When FW is switched to flower*, the correct
sriov_totalvfs value is presented.

* flower is a project name


  From the point of view of the PCI core (which knows nothing about
device firmware and relies on the architected config space described
by the PCIe spec), this sounds like an erratum: with some firmware
installed, the device is not capable of SR-IOV, but still advertises
an SR-IOV capability with "TotalVFs > 0".

Regardless of whether that's an erratum, we do allow PF drivers to use
pci_sriov_set_totalvfs() to limit the number of VFs that may be
enabled by writing to the PF's "sriov_numvfs" sysfs file.


+1.


But the current implementation does not allow a PF driver to limit VFs
to 0, and that does seem nonsensical.


Well, not really -- claiming to support VFs, and then wanting it to be 0...
I could certainly argue is non-sensical.
 From a sw perspective, sure, see if we can set VFs to 0 (and reset to another 
value later).

/me wishes that implementers would follow the architecture vs torquing it into 
strange shapes.


My understanding is OpenStack uses sriov_totalvfs to determine how many
VFs can be enabled, looks like this is the code:

http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464


2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
   sure what you intend for this.  Is *every* driver supposed to
   call it in .remove()?  Could/should this be done in the core
   somehow instead of depending on every driver?


Good question, I was just thinking yesterday we may want to call it
from the core, but I don't think it's strictly necessary nor always
sufficient (we may reload FW without re-probing).

We have a device which supports different number of VFs based on the FW
loaded.  Some legacy FWs does not inform the driver how many VFs it can
support, because it supports max.  So the flow in our driver is this:

load_fw(dev);
...
max_vfs = ask_fw_for_max_vfs(dev);
if (max_vfs >= 0)
return pci_sriov_set_totalvfs(dev, max_vfs);
else /* FW didn't tell us, assume max */
return pci_sriov_reset_totalvfs(dev);

We also reset the max on device remove, but that's not strictly
necessary.

Other users of pci_sriov_set_totalvfs() always know the value to set
the total to (either always get it from FW or it's a constant).

If you prefer we can work out the correct max for those legacy cases in
the driver as well, although it seemed cleaner to just ask the core,
since it already has total_VFs value handy :)


I'm also having a hard time connecting your user-space command example
with the rest of this.  Maybe it will make more sense to me tomorrow
after some coffee.


OpenStack assumes it will always be able to set sriov_numvfs to
sriov_totalvfs, see this 'if':

http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512


Thanks for educating me.  I think there are two issues here that we
can separate.  I extracted the patch below for the first.

The second is the question of resetting driver_max_VFs.  I think we
currently have a general issue in the core:

- load PF driver 1
- driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
- unload PF 

Re: [PATCH net-next 0/7] net: Add address attribute to control metric of prefix route

2018-05-29 Thread David Miller
From: dsah...@kernel.org
Date: Sun, 27 May 2018 08:09:52 -0700

> For use cases such as VRR (Virtual Router Redundancy) interface managers
> want efficient control over the order of prefix routes when multiple
> interfaces have addresses with overlapping/duplicate subnets.
> 
> Currently, if two interfaces have addresses in the same subnet, the order
> of the prefix route entries is determined by the order in which the
> addresses are assigned or the links brought up. Any actions like cycling
> an interface up and down changes that order. This set adds a new attribute
> for addresses to allow a user to specify the metric of the prefix route
> associated with an address giving interface managers better and more
> efficient control of the order of prefix routes.

Looks great, series applied, thanks David.


Re: [PATCH net-next 0/3] mlxsw: use MRSR register for FW reset

2018-05-29 Thread David Miller
From: Ido Schimmel 
Date: Sun, 27 May 2018 09:56:12 +0300

> Jiri says:
> 
> Introduce a MRSR register definition and use it to do FW reset instead
> of existing mechanism using PCI BAR0 register.

Series applied to net-next.


Re: [PATCH net-next 0/8] nfp: offload LAG for tc flower egress

2018-05-29 Thread John Hurley
On Sat, May 26, 2018 at 3:47 AM, Jakub Kicinski
 wrote:
> On Fri, 25 May 2018 08:48:09 +0200, Jiri Pirko wrote:
>> Thu, May 24, 2018 at 04:22:47AM CEST, jakub.kicin...@netronome.com wrote:
>> >Hi!
>> >
>> >This series from John adds bond offload to the nfp driver.  Patch 5
>> >exposes the hash type for NETDEV_LAG_TX_TYPE_HASH to make sure nfp
>> >hashing matches that of the software LAG.  This may be unnecessarily
>> >conservative, let's see what LAG maintainers think :)
>>
>> So you need to restrict offload to only certain hash algo? In mlxsw, we
>> just ignore the lag setting and do some hw default hashing. Would not be
>> enough? Note that there's a good reason for it, as you see, in team, the
>> hashing is done in a BPF function and could be totally arbitrary.
>> Your patchset effectively disables team offload for nfp.
>
> My understanding is that the project requirements only called for L3/L4
> hash algorithm offload, hence the temptation to err on the side of
> caution and not offload all the bond configurations.  John can provide
> more details.  Not being able to offload team is unfortunate indeed.

Hi Jiri,
Yes, as Jakub mentions, we restrict ourselves to L3/L4 hash algorithm
as this is currently what is supported in fw.
Hopefully this will change as fw features are expanded.
I understand the issue this presents with offloading team.
Perhaps resorting to a default hw hash for team is acceptable.
John


Re: [PATCH net] mlxsw: spectrum: Forbid creation of VLAN 1 over port/LAG

2018-05-29 Thread David Miller
From: Ido Schimmel 
Date: Sun, 27 May 2018 09:48:41 +0300

> From: Petr Machata 
> 
> VLAN 1 is internally used for untagged traffic. Prevent creation of
> explicit netdevice for that VLAN, because that currently isn't supported
> and leads to the NULL pointer dereference cited below.
> 
> Fix by preventing creation of VLAN devices with VID of 1 over mlxsw
> devices or LAG devices that involve mlxsw devices.
 ...
> Fixes: 9589a7b5d7d9 ("mlxsw: spectrum: Handle VLAN devices linking / 
> unlinking")
> Suggested-by: Ido Schimmel 
> Signed-off-by: Petr Machata 
> Signed-off-by: Ido Schimmel 

Applied and queued up for -stable.


Re: [PATCH] net: qcom/emac: fix device tree initialization

2018-05-29 Thread David Miller
From: Timur Tabi 
Date: Sat, 26 May 2018 20:29:14 -0500

> Commit "net: qcom/emac: Encapsulate sgmii ops under one structure"
> introduced the sgmii_ops structure, but did not correctly initialize
> it on device tree platforms.  This resulted in compiler warnings when
> ACPI is not enabled.
> 
> Reported-by: Arnd Bergmann 
> Signed-off-by: Timur Tabi 

Applied to net-next, thank you.


Re: [PATCH net] net: sched: check netif_xmit_frozen_or_stopped() in sch_direct_xmit()

2018-05-29 Thread David Miller
From: Song Liu 
Date: Fri, 25 May 2018 11:11:44 -0700

> Summary:
> 
> At the end of sch_direct_xmit(), we are in the else path of
> !dev_xmit_complete(ret), which means ret == NETDEV_TX_OK. The following
> condition will always fail and netif_xmit_frozen_or_stopped() is not
> checked at all.
> 
> if (ret && netif_xmit_frozen_or_stopped(txq))
>  return false;
> 
> In this patch, this condition is fixed as:
> 
> if (netif_xmit_frozen_or_stopped(txq))
>  return false;
> 
> and further simplifies the code as:
> 
> return !netif_xmit_frozen_or_stopped(txq);
> 
> Fixes: 29b86cdac00a ("net: sched: remove remaining uses for qdisc_qlen in 
> xmit path")
> Cc: John Fastabend 
> Cc: David S. Miller 
> Signed-off-by: Song Liu 

I expect a new version of this patch which removes the test entirely.


Re: [PATCH net-next 00/14] nfp: abm: RED/MQ qdisc offload

2018-05-29 Thread David Miller
From: Jakub Kicinski 
Date: Fri, 25 May 2018 21:53:24 -0700

> This is second batch of advanced buffer management nfp driver
> changes.  This series adds the qdisc offload.  Support for
> a very simple subset of RED qdisc offload is added as needed
> for DCTCP ECN marking (min and max thresholds set to the same
> value).
> 
> The first two patches fix glitches introduced by the previous
> series.  We have to be careful about phys_port_name handling,
> because VFs share the same code path, and some user space may
> get confused by the names we chose.
> 
> Since unlike previous offloads we can report the queue backlog
> both in bytes and packets we need to adjust how statistics are
> added up in the core (patch 6).
> 
> There are some extra statistics we want to expose which don't
> fit into TC stats, namely counts of packets which have been fast-
> -forwarded without getting enqueued because there was no
> contention and number of packets that were ever queued (sum of
> all momentary backlogs).  We expose those through ethtool stats
> (patches 8 and 9).
> 
> Remaining 5 patches add MQ offload - to be able to set different
> configurations on different queues.  Representors are made multi-
> -queue and we add offload support to MQ.  MQ stats are added up
> before calling ->dump qdiscs on the children, and therefore don't
> include updated offload values.  To avoid clearly incorrect stats
> MQ is made to also request stats update from offloads.  This way
> we can correct the diff at the driver level.

Series applied, thanks Jakub.


Re: [pull request][for-next 00/12] Mellanox, mlx5e updates 2018-05-25

2018-05-29 Thread David Miller
From: Saeed Mahameed 
Date: Fri, 25 May 2018 17:01:55 -0700

> This is a mlx5e only pull request, for more information please see tag
> log below.
> 
> Please pull and let me know if there's any problem.

Pulled, thanks Saeed.

There was a minor conflict to resolve (simple overlapping changes).


[PATCH rdma-next v2 08/13] IB/core: Add support for flow counters

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

A counters object could be attached to flow on creation
by providing the counter specification action.

General counters description which count packets and bytes are
introduced, downstream patches from this series will use them
as part of flow counters binding.

In addition, increase number of flow specifications supported
layers to 10 upon adding count specification and for the
previously added drop specification.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 include/rdma/ib_verbs.h | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 80956b1c9f4d..3acf7a9fa452 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1859,9 +1859,10 @@ enum ib_flow_spec_type {
IB_FLOW_SPEC_ACTION_TAG = 0x1000,
IB_FLOW_SPEC_ACTION_DROP= 0x1001,
IB_FLOW_SPEC_ACTION_HANDLE  = 0x1002,
+   IB_FLOW_SPEC_ACTION_COUNT   = 0x1003,
 };
 #define IB_FLOW_SPEC_LAYER_MASK0xF0
-#define IB_FLOW_SPEC_SUPPORT_LAYERS 8
+#define IB_FLOW_SPEC_SUPPORT_LAYERS 10

 /* Flow steering rule priority is set according to it's domain.
  * Lower domain value means higher priority.
@@ -2041,6 +2042,17 @@ struct ib_flow_spec_action_handle {
struct ib_flow_action*act;
 };

+enum ib_counters_description {
+   IB_COUNTER_PACKETS,
+   IB_COUNTER_BYTES,
+};
+
+struct ib_flow_spec_action_count {
+   enum ib_flow_spec_type type;
+   u16 size;
+   struct ib_counters *counters;
+};
+
 union ib_flow_spec {
struct {
u32 type;
@@ -2058,6 +2070,7 @@ union ib_flow_spec {
struct ib_flow_spec_action_tag  flow_tag;
struct ib_flow_spec_action_drop drop;
struct ib_flow_spec_action_handle action;
+   struct ib_flow_spec_action_count flow_count;
 };

 struct ib_flow_attr {
--
2.14.3



[PATCH rdma-next v2 05/13] IB/core: Introduce counters read verb

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

The user supplies counters instance and a reference to an output
array of uint64_t.
The driver reads the hardware counters values and writes them to
the output index location in the user supplied array.
All counters values are represented as uint64_t types.

To be able to successfully read the data the counters must be
first bound to an IB object.

Downstream patches will present binding method for
flow counters.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 include/rdma/ib_verbs.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index ce3d39725966..f6bd3b97b971 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2219,6 +2219,17 @@ struct ib_counters {
	atomic_t		usecnt;
 };

+enum ib_read_counters_flags {
+   /* prefer read values from driver cache */
+   IB_READ_COUNTERS_ATTR_PREFER_CACHED = 1 << 0,
+};
+
+struct ib_counters_read_attr {
+   u64 *counters_buff;
+   u32 ncounters;
+   u32 flags; /* use enum ib_read_counters_flags */
+};
+
 struct uverbs_attr_bundle;

 struct ib_device {
@@ -2493,6 +2504,9 @@ struct ib_device {
struct ib_counters *(*create_counters)(struct ib_device *device,
   struct uverbs_attr_bundle 
*attrs);
int (*destroy_counters)(struct ib_counters  *counters);
+   int (*read_counters)(struct ib_counters *counters,
+struct ib_counters_read_attr 
*counters_read_attr,
+struct uverbs_attr_bundle *attrs);

/**
 * rdma netdev operation
--
2.14.3



[PATCH mlx5-next v2 02/13] net/mlx5: Export flow counter related API

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

Exports counters API to be used in both IB and EN.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  | 23 --
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  |  3 +++
 include/linux/mlx5/fs.h| 22 +
 3 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index b6da322a8016..40992aed1791 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -131,29 +131,6 @@ struct mlx5_flow_table {
struct rhltable fgs_hash;
 };

-struct mlx5_fc_cache {
-   u64 packets;
-   u64 bytes;
-   u64 lastuse;
-};
-
-struct mlx5_fc {
-   struct rb_node node;
-   struct list_head list;
-
-   /* last{packets,bytes} members are used when calculating the delta since
-* last reading
-*/
-   u64 lastpackets;
-   u64 lastbytes;
-
-   u32 id;
-   bool deleted;
-   bool aging;
-
-   struct mlx5_fc_cache cache ____cacheline_aligned_in_smp;
-};
-
 struct mlx5_ft_underlay_qp {
struct list_head list;
u32 qpn;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index b7ab929d5f8e..10f407843e03 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -243,6 +243,7 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, 
bool aging)

return ERR_PTR(err);
 }
+EXPORT_SYMBOL(mlx5_fc_create);

 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 {
@@ -260,6 +261,7 @@ void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct 
mlx5_fc *counter)
mlx5_cmd_fc_free(dev, counter->id);
kfree(counter);
 }
+EXPORT_SYMBOL(mlx5_fc_destroy);

 int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 {
@@ -317,6 +319,7 @@ int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id,
 {
return mlx5_cmd_fc_query(dev, id, packets, bytes);
 }
+EXPORT_SYMBOL(mlx5_fc_query);

 void mlx5_fc_query_cached(struct mlx5_fc *counter,
  u64 *bytes, u64 *packets, u64 *lastuse)
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 9f4d32e41c06..93aab0f055b4 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -186,6 +186,28 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, 
bool aging);
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
  u64 *bytes, u64 *packets, u64 *lastuse);
+int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id,
+ u64 *packets, u64 *bytes);
+
+struct mlx5_fc_cache {
+   u64 packets;
+   u64 bytes;
+   u64 lastuse;
+};
+
+struct mlx5_fc {
+   struct rb_node node;
+   struct list_head list;
+
+   u64 lastpackets;
+   u64 lastbytes;
+
+   u32 id;
+   bool deleted;
+   bool aging;
+   struct mlx5_fc_cache cache ____cacheline_aligned_in_smp;
+};
+
 int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn);
 int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 
underlay_qpn);

--
2.14.3



[PATCH rdma-next v2 13/13] IB/mlx5: Add counters read support

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

This patch implements the uverbs counters read API, it will use the
specific read counters function to the given type to accomplish its
task.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 drivers/infiniband/hw/mlx5/main.c | 43 +++
 1 file changed, 43 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c 
b/drivers/infiniband/hw/mlx5/main.c
index f4da59e39c9e..d775fac9a1ef 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5327,6 +5327,48 @@ static void depopulate_specs_root(struct mlx5_ib_dev 
*dev)
uverbs_free_spec_tree(dev->ib_dev.specs_root);
 }

+static int mlx5_ib_read_counters(struct ib_counters *counters,
+struct ib_counters_read_attr *read_attr,
+struct uverbs_attr_bundle *attrs)
+{
+   struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+   struct mlx5_read_counters_attr mread_attr = {};
+   u32 *desc;
+   int ret, i;
+
+   mutex_lock(&mcounters->mcntrs_mutex);
+   if (mcounters->cntrs_max_index > read_attr->ncounters) {
+   ret = -EINVAL;
+   goto err_bound;
+   }
+
+   mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
+GFP_KERNEL);
+   if (!mread_attr.out) {
+   ret = -ENOMEM;
+   goto err_bound;
+   }
+
+   mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
+   mread_attr.flags = read_attr->flags;
+   ret = mcounters->read_counters(counters->device, &mread_attr);
+   if (ret)
+   goto err_read;
+
+   /* do the pass over the counters data array to assign according to the
+* descriptions and indexing pairs
+*/
+   desc = mcounters->counters_data;
+   for (i = 0; i < mcounters->ncounters * 2; i += 2)
+   read_attr->counters_buff[desc[i + 1]] += 
mread_attr.out[desc[i]];
+
+err_read:
+   kfree(mread_attr.out);
+err_bound:
+   mutex_unlock(&mcounters->mcntrs_mutex);
+   return ret;
+}
+
 static int mlx5_ib_destroy_counters(struct ib_counters *counters)
 {
struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
@@ -5600,6 +5642,7 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
dev->ib_dev.driver_id = RDMA_DRIVER_MLX5;
dev->ib_dev.create_counters = mlx5_ib_create_counters;
dev->ib_dev.destroy_counters = mlx5_ib_destroy_counters;
+   dev->ib_dev.read_counters = mlx5_ib_read_counters;

err = init_node_data(dev);
if (err)
--
2.14.3



[PATCH rdma-next v2 09/13] IB/uverbs: Add support for flow counters

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

The struct ib_uverbs_flow_spec_action_count associates
a counters object with the flow.

Post this association the flow counters can be read via
the counters object.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 drivers/infiniband/core/uverbs.h |  1 +
 drivers/infiniband/core/uverbs_cmd.c | 81 +++-
 include/uapi/rdma/ib_user_verbs.h| 13 ++
 3 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 5b2461fa634d..c0d40fc3a53a 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -263,6 +263,7 @@ struct ib_uverbs_flow_spec {
struct ib_uverbs_flow_spec_action_tag   flow_tag;
struct ib_uverbs_flow_spec_action_drop  drop;
struct ib_uverbs_flow_spec_action_handle action;
+   struct ib_uverbs_flow_spec_action_count flow_count;
};
 };

diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index ddb9d79691be..3179a95c6f5e 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2748,43 +2748,82 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file 
*file,
 struct ib_uflow_resources {
size_t  max;
size_t  num;
-   struct ib_flow_action   *collection[0];
+   size_t  collection_num;
+   size_t  counters_num;
+   struct ib_counters  **counters;
+   struct ib_flow_action   **collection;
 };

 static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs)
 {
struct ib_uflow_resources *resources;

-   resources =
-   kmalloc(sizeof(*resources) +
-   num_specs * sizeof(*resources->collection), GFP_KERNEL);
+   resources = kzalloc(sizeof(*resources), GFP_KERNEL);

if (!resources)
-   return NULL;
+   goto err_res;
+
+   resources->counters =
+   kcalloc(num_specs, sizeof(*resources->counters), GFP_KERNEL);
+
+   if (!resources->counters)
+   goto err_cnt;
+
+   resources->collection =
+   kcalloc(num_specs, sizeof(*resources->collection), GFP_KERNEL);
+
+   if (!resources->collection)
+   goto err_collection;

-   resources->num = 0;
resources->max = num_specs;

return resources;
+
+err_collection:
+   kfree(resources->counters);
+err_cnt:
+   kfree(resources);
+err_res:
+   return NULL;
 }

 void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res)
 {
unsigned int i;

-   for (i = 0; i < uflow_res->num; i++)
+   for (i = 0; i < uflow_res->collection_num; i++)
atomic_dec(&uflow_res->collection[i]->usecnt);

+   for (i = 0; i < uflow_res->counters_num; i++)
+   atomic_dec(&uflow_res->counters[i]->usecnt);
+
+   kfree(uflow_res->collection);
+   kfree(uflow_res->counters);
kfree(uflow_res);
 }

 static void flow_resources_add(struct ib_uflow_resources *uflow_res,
-  struct ib_flow_action *action)
+  enum ib_flow_spec_type type,
+  void *ibobj)
 {
WARN_ON(uflow_res->num >= uflow_res->max);

-   atomic_inc(&action->usecnt);
-   uflow_res->collection[uflow_res->num++] = action;
+   switch (type) {
+   case IB_FLOW_SPEC_ACTION_HANDLE:
+   atomic_inc(&((struct ib_flow_action *)ibobj)->usecnt);
+   uflow_res->collection[uflow_res->collection_num++] =
+   (struct ib_flow_action *)ibobj;
+   break;
+   case IB_FLOW_SPEC_ACTION_COUNT:
+   atomic_inc(&((struct ib_counters *)ibobj)->usecnt);
+   uflow_res->counters[uflow_res->counters_num++] =
+   (struct ib_counters *)ibobj;
+   break;
+   default:
+   WARN_ON(1);
+   }
+
+   uflow_res->num++;
 }

 static int kern_spec_to_ib_spec_action(struct ib_ucontext *ucontext,
@@ -2821,9 +2860,29 @@ static int kern_spec_to_ib_spec_action(struct 
ib_ucontext *ucontext,
return -EINVAL;
ib_spec->action.size =
sizeof(struct ib_flow_spec_action_handle);
-   flow_resources_add(uflow_res, ib_spec->action.act);
+   flow_resources_add(uflow_res,
+  IB_FLOW_SPEC_ACTION_HANDLE,
+  ib_spec->action.act);
uobj_put_obj_read(ib_spec->action.act);
break;
+   case IB_FLOW_SPEC_ACTION_COUNT:
+   if (kern_spec->flow_count.size !=
+   sizeof(struct ib_uverbs_flow_spec_action_count))
+   return -EINVAL;
+  

[PATCH rdma-next v2 12/13] IB/mlx5: Add flow counters read support

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

Implements the flow counters read wrapper.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 drivers/infiniband/hw/mlx5/main.c| 15 +++
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 13 -
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c 
b/drivers/infiniband/hw/mlx5/main.c
index f846956833e5..f4da59e39c9e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3149,6 +3149,19 @@ static void set_underlay_qp(struct mlx5_ib_dev *dev,
}
 }

+static int read_flow_counters(struct ib_device *ibdev,
+ struct mlx5_read_counters_attr *read_attr)
+{
+   struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
+   struct mlx5_ib_dev *dev = to_mdev(ibdev);
+
+   return mlx5_fc_query(dev->mdev, fc->id,
+			     &read_attr->out[IB_COUNTER_PACKETS],
+			     &read_attr->out[IB_COUNTER_BYTES]);
+}
+
+/* flow counters currently expose two counters packets and bytes */
+#define FLOW_COUNTERS_NUM 2
 static int counters_set_description(struct ib_counters *counters,
enum mlx5_ib_counters_type counters_type,
u32 *desc_data,
@@ -3163,6 +3176,8 @@ static int counters_set_description(struct ib_counters 
*counters,

/* init the fields for the object */
mcounters->type = counters_type;
+   mcounters->read_counters = read_flow_counters;
+   mcounters->counters_num = FLOW_COUNTERS_NUM;
mcounters->ncounters = ncounters;
/* each counter entry have both description and index pair */
for (i = 0; i < ncounters * 2; i += 2) {
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h 
b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7313d3cd04f0..810557b5a5c1 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -814,6 +814,12 @@ struct mlx5_memic {
DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES);
 };

+struct mlx5_read_counters_attr {
+   struct mlx5_fc *hw_cntrs_hndl;
+   u64 *out;
+   u32 flags;
+};
+
 enum mlx5_ib_counters_type {
MLX5_IB_COUNTERS_FLOW,
 };
@@ -821,7 +827,12 @@ enum mlx5_ib_counters_type {
 struct mlx5_ib_mcounters {
struct ib_counters ibcntrs;
enum mlx5_ib_counters_type type;
-   void *hw_cntrs_hndl;
+   /* number of counters supported for this counters type */
+   u32 counters_num;
+   struct mlx5_fc *hw_cntrs_hndl;
+   /* read function for this counters type */
+   int (*read_counters)(struct ib_device *ibdev,
+struct mlx5_read_counters_attr *read_attr);
/* max index set as part of create_flow */
u32 cntrs_max_index;
/* number of counters data entries ( pair) */
--
2.14.3



[PATCH rdma-next v2 06/13] IB/uverbs: Add read counters support

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

This patch exposes the read counters verb to user space
applications.
By that verb the user can read the hardware counters which
are associated with the counters object.

The application needs to provide a sufficient memory to
hold the statistics.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 .../infiniband/core/uverbs_std_types_counters.c| 59 +-
 include/uapi/rdma/ib_user_ioctl_cmds.h |  7 +++
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c 
b/drivers/infiniband/core/uverbs_std_types_counters.c
index a5bc50ceee13..b35fcd3718c8 100644
--- a/drivers/infiniband/core/uverbs_std_types_counters.c
+++ b/drivers/infiniband/core/uverbs_std_types_counters.c
@@ -80,6 +80,49 @@ static int 
UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(struct ib_device *ib_de
return ret;
 }

+static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(struct ib_device 
*ib_dev,
+  struct ib_uverbs_file 
*file,
+  struct 
uverbs_attr_bundle *attrs)
+{
+   struct ib_counters_read_attr read_attr = {};
+   const struct uverbs_attr *uattr;
+   struct ib_counters *counters =
+   uverbs_attr_get_obj(attrs, UVERBS_ATTR_READ_COUNTERS_HANDLE);
+   int ret;
+
+   if (!ib_dev->read_counters)
+   return -EOPNOTSUPP;
+
+   if (!atomic_read(&counters->usecnt))
+   return -EINVAL;
+
+   ret = uverbs_copy_from(&read_attr.flags, attrs,
+  UVERBS_ATTR_READ_COUNTERS_FLAGS);
+   if (ret)
+   return ret;
+
+   uattr = uverbs_attr_get(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF);
+   read_attr.ncounters = uattr->ptr_attr.len / sizeof(u64);
+   read_attr.counters_buff = kcalloc(read_attr.ncounters,
+ sizeof(u64), GFP_KERNEL);
+   if (!read_attr.counters_buff)
+   return -ENOMEM;
+
+   ret = ib_dev->read_counters(counters,
+   &read_attr,
+   attrs);
+   if (ret)
+   goto err_read;
+
+   ret = uverbs_copy_to(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF,
+read_attr.counters_buff,
+read_attr.ncounters * sizeof(u64));
+
+err_read:
+   kfree(read_attr.counters_buff);
+   return ret;
+}
+
 static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_COUNTERS_CREATE,
&UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_COUNTERS_HANDLE,
 UVERBS_OBJECT_COUNTERS,
@@ -93,8 +136,22 @@ static 
DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_COUNTERS_DESTROY,
 UVERBS_ACCESS_DESTROY,
 UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)));

+#define MAX_COUNTERS_BUFF_SIZE USHRT_MAX
+static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_COUNTERS_READ,
+   _ATTR_IDR(UVERBS_ATTR_READ_COUNTERS_HANDLE,
+UVERBS_OBJECT_COUNTERS,
+UVERBS_ACCESS_READ,
+UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)),
+   _ATTR_PTR_OUT(UVERBS_ATTR_READ_COUNTERS_BUFF,
+UVERBS_ATTR_SIZE(0, MAX_COUNTERS_BUFF_SIZE),
+UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)),
+   _ATTR_PTR_IN(UVERBS_ATTR_READ_COUNTERS_FLAGS,
+   UVERBS_ATTR_TYPE(__u32),
+   UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)));
+
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COUNTERS,
_TYPE_ALLOC_IDR(0, uverbs_free_counters),
_METHOD(UVERBS_METHOD_COUNTERS_CREATE),
-   _METHOD(UVERBS_METHOD_COUNTERS_DESTROY));
+   _METHOD(UVERBS_METHOD_COUNTERS_DESTROY),
+   _METHOD(UVERBS_METHOD_COUNTERS_READ));

diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h 
b/include/uapi/rdma/ib_user_ioctl_cmds.h
index c28ce62d2e40..888ac5975a6c 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -140,9 +140,16 @@ enum uverbs_attrs_destroy_counters_cmd_attr_ids {
UVERBS_ATTR_DESTROY_COUNTERS_HANDLE,
 };

+enum uverbs_attrs_read_counters_cmd_attr_ids {
+   UVERBS_ATTR_READ_COUNTERS_HANDLE,
+   UVERBS_ATTR_READ_COUNTERS_BUFF,
+   UVERBS_ATTR_READ_COUNTERS_FLAGS,
+};
+
 enum uverbs_methods_actions_counters_ops {
UVERBS_METHOD_COUNTERS_CREATE,
UVERBS_METHOD_COUNTERS_DESTROY,
+   UVERBS_METHOD_COUNTERS_READ,
 };

 #endif
--
2.14.3



[PATCH rdma-next v2 03/13] IB/core: Introduce counters object and its create/destroy

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

A verbs application may need to get statistics and info on various
aspects of a verb object (e.g. Flow, QP, ...). In the general case the
application will state which object's counters it is interested in
(we refer to this action as attach), bind this new counters object
to the appropriate verb object and at a later stage read the values
using the counters object.

This series introduces a general API for counters object that may
accumulate any ib object counters type, bound and read on demand.

Counters instance is allocated on an IB context and belongs to
that context.
Upon successful creation the counters can be bound to a verbs
object so that hardware counter instances can be created and read.

Downstream patches in this series will introduce the attach, bind
and the read functionality.

A counters instance can be de-allocated; upon successful
destruction the related hardware resources are released.

Prior to the destroy call the user must first make sure that the
counters object is not being used by any IB object, e.g. not attached
to any of its counted types, otherwise an EBUSY error is returned.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 include/rdma/ib_verbs.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index e849bd0fc618..ce3d39725966 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2212,6 +2212,13 @@ struct ib_port_pkey_list {
struct list_head  pkey_list;
 };

+struct ib_counters {
+   struct ib_device*device;
+   struct ib_uobject   *uobject;
+   /* num of objects attached */
+   atomic_tusecnt;
+};
+
 struct uverbs_attr_bundle;

 struct ib_device {
@@ -2483,6 +2490,10 @@ struct ib_device {
struct ib_mr * (*reg_dm_mr)(struct ib_pd *pd, struct ib_dm 
*dm,
struct ib_dm_mr_attr *attr,
struct uverbs_attr_bundle 
*attrs);
+   struct ib_counters *(*create_counters)(struct ib_device *device,
+  struct uverbs_attr_bundle 
*attrs);
+   int (*destroy_counters)(struct ib_counters  *counters);
+
/**
 * rdma netdev operation
 *
--
2.14.3



[PATCH mlx5-next v2 11/13] IB/mlx5: Add flow counters binding support

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

Associates a counters with a flow when IB_FLOW_SPEC_ACTION_COUNT
is part of the flow specifications.

The counters' user-space placement, given as (index, description)
pairs, is passed as private data of the counters flow
specification.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 drivers/infiniband/hw/mlx5/main.c| 223 ---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  15 +++
 include/linux/mlx5/fs.h  |   1 +
 include/uapi/rdma/mlx5-abi.h |  14 +++
 4 files changed, 239 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c 
b/drivers/infiniband/hw/mlx5/main.c
index 18bfee86fa52..f846956833e5 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2472,7 +2472,7 @@ static int check_mpls_supp_fields(u32 field_support, 
const __be32 *set_mask)
 #define LAST_TUNNEL_FIELD tunnel_id
 #define LAST_FLOW_TAG_FIELD tag_id
 #define LAST_DROP_FIELD size
-#define LAST_DROP_FIELD size
+#define LAST_COUNTERS_FIELD counters

 /* Field is the last supported field */
 #define FIELDS_NOT_SUPPORTED(filter, field)\
@@ -2836,6 +2836,18 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, 
u32 *match_c,
if (ret)
return ret;
break;
+   case IB_FLOW_SPEC_ACTION_COUNT:
+   if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count,
+LAST_COUNTERS_FIELD))
+   return -EOPNOTSUPP;
+
+   /* for now support only one counters spec per flow */
+   if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
+   return -EINVAL;
+
+   action->counters = ib_spec->flow_count.counters;
+   action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+   break;
default:
return -EINVAL;
}
@@ -2983,6 +2995,17 @@ static void put_flow_table(struct mlx5_ib_dev *dev,
}
 }

+static void counters_clear_description(struct ib_counters *counters)
+{
+   struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+
+   mutex_lock(>mcntrs_mutex);
+   kfree(mcounters->counters_data);
+   mcounters->counters_data = NULL;
+   mcounters->cntrs_max_index = 0;
+   mutex_unlock(>mcntrs_mutex);
+}
+
 static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
 {
struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device);
@@ -3002,8 +3025,11 @@ static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)

mlx5_del_flow_rules(handler->rule);
put_flow_table(dev, handler->prio, true);
-   mutex_unlock(>flow_db->lock);
+   if (handler->ibcounters &&
+   atomic_read(>ibcounters->usecnt) == 1)
+   counters_clear_description(handler->ibcounters);

+   mutex_unlock(>flow_db->lock);
kfree(handler);

return 0;
@@ -3123,22 +3149,128 @@ static void set_underlay_qp(struct mlx5_ib_dev *dev,
}
 }

+static int counters_set_description(struct ib_counters *counters,
+   enum mlx5_ib_counters_type counters_type,
+   u32 *desc_data,
+   u32 ncounters)
+{
+   struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+   u32 cntrs_max_index = 0;
+   int i;
+
+   if (counters_type != MLX5_IB_COUNTERS_FLOW)
+   return -EINVAL;
+
+   /* init the fields for the object */
+   mcounters->type = counters_type;
+   mcounters->ncounters = ncounters;
+   /* each counter entry have both description and index pair */
+   for (i = 0; i < ncounters * 2; i += 2) {
+   if (desc_data[i] > IB_COUNTER_BYTES)
+   return -EINVAL;
+
+   if (cntrs_max_index <= desc_data[i + 1])
+   cntrs_max_index = desc_data[i + 1] + 1;
+   }
+
+   mutex_lock(>mcntrs_mutex);
+   mcounters->counters_data = desc_data;
+   mcounters->cntrs_max_index = cntrs_max_index;
+   mutex_unlock(>mcntrs_mutex);
+
+   return 0;
+}
+
+#define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
+static int flow_counters_set_data(struct ib_counters *ibcounters,
+ struct mlx5_ib_create_flow *ucmd)
+{
+   struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
+   struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
+   u32 *desc_data = NULL;
+   bool hw_hndl = false;
+   int ret = 0;
+
+   if (ucmd && ucmd->ncounters_data != 0) {
+   cntrs_data = ucmd->data;
+   if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
+   return -EINVAL;
+
+   desc_data = kcalloc(cntrs_data->ncounters,
+   sizeof(u32) * 2,
+   

[PATCH rdma-next v2 10/13] IB/mlx5: Add counters create and destroy support

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

This patch implements the device counters create and destroy APIs
and introducing some internal management structures.

Downstream patches in this series will add the functionality to
support flow counters binding and reading.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 drivers/infiniband/hw/mlx5/main.c| 23 +++
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 10 ++
 2 files changed, 33 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c 
b/drivers/infiniband/hw/mlx5/main.c
index 59f86198eb3b..18bfee86fa52 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5124,6 +5124,27 @@ static void depopulate_specs_root(struct mlx5_ib_dev 
*dev)
uverbs_free_spec_tree(dev->ib_dev.specs_root);
 }

+static int mlx5_ib_destroy_counters(struct ib_counters *counters)
+{
+   struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+
+   kfree(mcounters);
+
+   return 0;
+}
+
+static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
+  struct uverbs_attr_bundle 
*attrs)
+{
+   struct mlx5_ib_mcounters *mcounters;
+
+   mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL);
+   if (!mcounters)
+   return ERR_PTR(-ENOMEM);
+
+   return >ibcntrs;
+}
+
 void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
mlx5_ib_cleanup_multiport_master(dev);
@@ -5367,6 +5388,8 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
dev->ib_dev.destroy_flow_action = mlx5_ib_destroy_flow_action;
dev->ib_dev.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp;
dev->ib_dev.driver_id = RDMA_DRIVER_MLX5;
+   dev->ib_dev.create_counters = mlx5_ib_create_counters;
+   dev->ib_dev.destroy_counters = mlx5_ib_destroy_counters;

err = init_node_data(dev);
if (err)
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h 
b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 49a1aa0ff429..fd27ec1aed08 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -813,6 +813,16 @@ struct mlx5_memic {
DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES);
 };

+struct mlx5_ib_mcounters {
+   struct ib_counters ibcntrs;
+};
+
+static inline struct mlx5_ib_mcounters *
+to_mcounters(struct ib_counters *ibcntrs)
+{
+   return container_of(ibcntrs, struct mlx5_ib_mcounters, ibcntrs);
+}
+
 struct mlx5_ib_dev {
struct ib_deviceib_dev;
struct mlx5_core_dev*mdev;
--
2.14.3



[PATCH rdma-next v2 04/13] IB/uverbs: Add create/destroy counters support

2018-05-29 Thread Leon Romanovsky
From: Raed Salem 

User space application which uses counters functionality,
is expected to allocate/release the counters resources by
calling create/destroy verbs and in turn get a unique handle
that can be used to attach the counters to its counted type.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Raed Salem 
Signed-off-by: Leon Romanovsky 
---
 drivers/infiniband/core/Makefile   |   2 +-
 drivers/infiniband/core/uverbs.h   |   1 +
 drivers/infiniband/core/uverbs_std_types.c |   3 +-
 .../infiniband/core/uverbs_std_types_counters.c| 100 +
 include/uapi/rdma/ib_user_ioctl_cmds.h |  14 +++
 5 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 drivers/infiniband/core/uverbs_std_types_counters.c

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 8d42373a2d8a..61667705d746 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -37,4 +37,4 @@ ib_uverbs-y :=uverbs_main.o 
uverbs_cmd.o uverbs_marshall.o \
rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
uverbs_ioctl_merge.o uverbs_std_types_cq.o \
uverbs_std_types_flow_action.o 
uverbs_std_types_dm.o \
-   uverbs_std_types_mr.o
+   uverbs_std_types_mr.o 
uverbs_std_types_counters.o
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index cfb51618ab7a..5b2461fa634d 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -287,6 +287,7 @@ extern const struct uverbs_object_def 
UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL);
 extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_XRCD);
 extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION);
 extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DM);
+extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS);

 #define IB_UVERBS_DECLARE_CMD(name)\
ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,   \
diff --git a/drivers/infiniband/core/uverbs_std_types.c 
b/drivers/infiniband/core/uverbs_std_types.c
index 569f48bd821e..b570acbd94af 100644
--- a/drivers/infiniband/core/uverbs_std_types.c
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -302,7 +302,8 @@ static DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects,
  _OBJECT(UVERBS_OBJECT_RWQ_IND_TBL),
  _OBJECT(UVERBS_OBJECT_XRCD),
  _OBJECT(UVERBS_OBJECT_FLOW_ACTION),
- _OBJECT(UVERBS_OBJECT_DM));
+ _OBJECT(UVERBS_OBJECT_DM),
+ _OBJECT(UVERBS_OBJECT_COUNTERS));

 const struct uverbs_object_tree_def *uverbs_default_get_objects(void)
 {
diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c 
b/drivers/infiniband/core/uverbs_std_types_counters.c
new file mode 100644
index ..a5bc50ceee13
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_counters.c
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR 
BSD-2-Clause) */
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "uverbs.h"
+#include 
+
+static int uverbs_free_counters(struct ib_uobject *uobject,
+  

[PATCH rdma-next v2 07/13] IB/core: Support passing uhw for create_flow

2018-05-29 Thread Leon Romanovsky
From: Matan Barak 

This is required when user-space drivers need to pass extra information
regarding how to handle this flow steering specification.

Tested-by: Michael Guralnik 
Reviewed-by: Yishai Hadas 
Signed-off-by: Matan Barak 
Signed-off-by: Boris Pismenny 
Signed-off-by: Leon Romanovsky 
---
 drivers/infiniband/core/uverbs_cmd.c | 7 ++-
 drivers/infiniband/core/verbs.c  | 2 +-
 drivers/infiniband/hw/mlx4/main.c| 6 +-
 drivers/infiniband/hw/mlx5/main.c| 7 ++-
 include/rdma/ib_verbs.h  | 3 ++-
 5 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index e74262ee104c..ddb9d79691be 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3542,11 +3542,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file 
*file,
err = -EINVAL;
goto err_free;
}
-   flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
+
+   flow_id = qp->device->create_flow(qp, flow_attr,
+ IB_FLOW_DOMAIN_USER, uhw);
+
if (IS_ERR(flow_id)) {
err = PTR_ERR(flow_id);
goto err_free;
}
+   atomic_inc(>usecnt);
+   flow_id->qp = qp;
flow_id->uobject = uobj;
uobj->object = flow_id;
uflow = container_of(uobj, typeof(*uflow), uobject);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 6ddfb1fade79..0b56828c1319 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1983,7 +1983,7 @@ struct ib_flow *ib_create_flow(struct ib_qp *qp,
if (!qp->device->create_flow)
return ERR_PTR(-EOPNOTSUPP);

-   flow_id = qp->device->create_flow(qp, flow_attr, domain);
+   flow_id = qp->device->create_flow(qp, flow_attr, domain, NULL);
if (!IS_ERR(flow_id)) {
atomic_inc(>usecnt);
flow_id->qp = qp;
diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index bf12394c13c1..6fe5d5d1d1d9 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1848,7 +1848,7 @@ static int mlx4_ib_add_dont_trap_rule(struct mlx4_dev 
*dev,

 static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
struct ib_flow_attr *flow_attr,
-   int domain)
+   int domain, struct ib_udata *udata)
 {
int err = 0, i = 0, j = 0;
struct mlx4_ib_flow *mflow;
@@ -1866,6 +1866,10 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp 
*qp,
(flow_attr->type != IB_FLOW_ATTR_NORMAL))
return ERR_PTR(-EOPNOTSUPP);

+   if (udata &&
+   udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen))
+   return ERR_PTR(-EOPNOTSUPP);
+
memset(type, 0, sizeof(type));

mflow = kzalloc(sizeof(*mflow), GFP_KERNEL);
diff --git a/drivers/infiniband/hw/mlx5/main.c 
b/drivers/infiniband/hw/mlx5/main.c
index 25a271ef8374..59f86198eb3b 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3363,7 +3363,8 @@ static struct mlx5_ib_flow_handler 
*create_sniffer_rule(struct mlx5_ib_dev *dev,

 static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
   struct ib_flow_attr *flow_attr,
-  int domain)
+  int domain,
+  struct ib_udata *udata)
 {
struct mlx5_ib_dev *dev = to_mdev(qp->device);
struct mlx5_ib_qp *mqp = to_mqp(qp);
@@ -3375,6 +3376,10 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp 
*qp,
int err;
int underlay_qpn;

+   if (udata &&
+   udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen))
+   return ERR_PTR(-EOPNOTSUPP);
+
if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO)
return ERR_PTR(-ENOMEM);

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index f6bd3b97b971..80956b1c9f4d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2459,7 +2459,8 @@ struct ib_device {
struct ib_flow *   (*create_flow)(struct ib_qp *qp,
  struct ib_flow_attr
  *flow_attr,
- int domain);
+ int domain,
+ struct ib_udata *udata);
int(*destroy_flow)(struct ib_flow *flow_id);
int(*check_mr_status)(struct ib_mr *mr, u32 
check_mask,
   

[PATCH rdma-next v2 01/13] IB/uverbs: Add an ib_uobject getter to ioctl() infrastructure

2018-05-29 Thread Leon Romanovsky
From: Matan Barak 

Previously, the user had to dig inside the attribute to get the uobject.
Add a helper function that correctly extract it (and do the required
checks) for him/her.

Tested-by: Michael Guralnik 
Signed-off-by: Matan Barak 
Signed-off-by: Leon Romanovsky 
---
 drivers/infiniband/core/uverbs_std_types_cq.c  | 23 +++---
 .../infiniband/core/uverbs_std_types_flow_action.c |  4 ++--
 include/rdma/uverbs_ioctl.h| 11 +++
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c 
b/drivers/infiniband/core/uverbs_std_types_cq.c
index b0dbae9dd0d7..3d293d01afea 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -65,7 +65,6 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct 
ib_device *ib_dev,
struct ib_cq_init_attr attr = {};
struct ib_cq   *cq;
struct ib_uverbs_completion_event_file*ev_file = NULL;
-   const struct uverbs_attr *ev_file_attr;
struct ib_uobject *ev_file_uobj;

if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ))
@@ -87,10 +86,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct 
ib_device *ib_dev,
UVERBS_ATTR_CREATE_CQ_FLAGS)))
return -EFAULT;

-   ev_file_attr = uverbs_attr_get(attrs, 
UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL);
-   if (!IS_ERR(ev_file_attr)) {
-   ev_file_uobj = ev_file_attr->obj_attr.uobject;
-
+   ev_file_uobj = uverbs_attr_get_uobject(attrs, 
UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL);
+   if (!IS_ERR(ev_file_uobj)) {
ev_file = container_of(ev_file_uobj,
   struct ib_uverbs_completion_event_file,
   uobj_file.uobj);
@@ -102,8 +99,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct 
ib_device *ib_dev,
goto err_event_file;
}

-   obj = container_of(uverbs_attr_get(attrs,
-  
UVERBS_ATTR_CREATE_CQ_HANDLE)->obj_attr.uobject,
+   obj = container_of(uverbs_attr_get_uobject(attrs,
+  
UVERBS_ATTR_CREATE_CQ_HANDLE),
   typeof(*obj), uobject);
obj->uverbs_file   = ucontext->ufile;
obj->comp_events_reported  = 0;
@@ -170,13 +167,17 @@ static int 
UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(struct ib_device *ib_dev,
struct ib_uverbs_file *file,
struct uverbs_attr_bundle 
*attrs)
 {
-   struct ib_uverbs_destroy_cq_resp resp;
struct ib_uobject *uobj =
-   uverbs_attr_get(attrs, 
UVERBS_ATTR_DESTROY_CQ_HANDLE)->obj_attr.uobject;
-   struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object,
-uobject);
+   uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE);
+   struct ib_uverbs_destroy_cq_resp resp;
+   struct ib_ucq_object *obj;
int ret;

+   if (IS_ERR(uobj))
+   return PTR_ERR(uobj);
+
+   obj = container_of(uobj, struct ib_ucq_object, uobject);
+
if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_DESTROY_CQ))
return -EOPNOTSUPP;

diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c 
b/drivers/infiniband/core/uverbs_std_types_flow_action.c
index b4f016dfa23d..a7be51cf2e42 100644
--- a/drivers/infiniband/core/uverbs_std_types_flow_action.c
+++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c
@@ -320,7 +320,7 @@ static int 
UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct ib_device
return ret;

/* No need to check as this attribute is marked as MANDATORY */
-   uobj = uverbs_attr_get(attrs, 
UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject;
+   uobj = uverbs_attr_get_uobject(attrs, 
UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE);
action = ib_dev->create_flow_action_esp(ib_dev, _attr.hdr, attrs);
if (IS_ERR(action))
return PTR_ERR(action);
@@ -350,7 +350,7 @@ static int 
UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(struct ib_device
if (ret)
return ret;

-   uobj = uverbs_attr_get(attrs, 
UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject;
+   uobj = uverbs_attr_get_uobject(attrs, 
UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE);
action = uobj->object;

if (action->type != IB_FLOW_ACTION_ESP)
diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h
index 4a4201d997a7..7ac6271a5ee0 100644
--- a/include/rdma/uverbs_ioctl.h
+++ b/include/rdma/uverbs_ioctl.h
@@ -420,6 +420,17 @@ static inline void *uverbs_attr_get_obj(const struct 
uverbs_attr_bundle 

  1   2   >