Re: [PATCH] samples/bpf: Add xdp_sample_pkts example

2018-05-30 Thread Song Liu
On Wed, May 30, 2018 at 9:45 AM, Toke Høiland-Jørgensen  wrote:
> This adds an example program showing how to sample packets from XDP using
> the perf event buffer. The example userspace program just prints the
> ethernet header for every packet sampled.
>
> Most of the userspace code is borrowed from other examples, most notably
> trace_output.
>
> Note that the example only works when everything runs on CPU0; so
> suitable smp_affinity needs to be set on the device. Some drivers seem
> to reset smp_affinity when loading an XDP program, so it may be
> necessary to change it after starting the example userspace program.

Why does this only work when everything runs on CPU0? Is this something
we can improve?

Thanks,
Song
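
For context: the sample passes map index 0 in the lower 32 bits of the
flags argument (flags = SAMPLE_SIZE << 32), so events only reach the
perf ring that userspace opened for index 0, i.e. CPU0. A sketch of one
way to lift the restriction, assuming BPF_F_CURRENT_CPU semantics and a
map sized to the number of CPUs (not part of the posted patch):

	/* Sketch only: emit to the current CPU's ring instead of
	 * hard-coding index 0; userspace would then open one perf
	 * event per CPU.
	 */
	u64 flags = BPF_F_CURRENT_CPU | (SAMPLE_SIZE << 32);

	bpf_perf_event_output(ctx, &my_map, flags,
			      &metadata, sizeof(metadata));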

>
> Signed-off-by: Toke Høiland-Jørgensen 
> ---
>  samples/bpf/Makefile   |   4 +
>  samples/bpf/xdp_sample_pkts_kern.c |  48 
>  samples/bpf/xdp_sample_pkts_user.c | 147 +
>  3 files changed, 199 insertions(+)
>  create mode 100644 samples/bpf/xdp_sample_pkts_kern.c
>  create mode 100644 samples/bpf/xdp_sample_pkts_user.c
>
> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> index 1303af1..6f0c6d2 100644
> --- a/samples/bpf/Makefile
> +++ b/samples/bpf/Makefile
> @@ -52,6 +52,7 @@ hostprogs-y += xdp_adjust_tail
>  hostprogs-y += xdpsock
>  hostprogs-y += xdp_fwd
>  hostprogs-y += task_fd_query
> +hostprogs-y += xdp_sample_pkts
>
>  # Libbpf dependencies
>  LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
> @@ -107,6 +108,7 @@ xdp_adjust_tail-objs := xdp_adjust_tail_user.o
>  xdpsock-objs := bpf_load.o xdpsock_user.o
>  xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
>  task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
> +xdp_sample_pkts-objs := bpf_load.o xdp_sample_pkts_user.o $(TRACE_HELPERS)
>
>  # Tell kbuild to always build the programs
>  always := $(hostprogs-y)
> @@ -163,6 +165,7 @@ always += xdp_adjust_tail_kern.o
>  always += xdpsock_kern.o
>  always += xdp_fwd_kern.o
>  always += task_fd_query_kern.o
> +always += xdp_sample_pkts_kern.o
>
>  HOSTCFLAGS += -I$(objtree)/usr/include
>  HOSTCFLAGS += -I$(srctree)/tools/lib/
> @@ -179,6 +182,7 @@ HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
>  HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
>  HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
>  HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
> +HOSTCFLAGS_xdp_sample_pkts_user.o += -I$(srctree)/tools/lib/bpf/
>
>  HOST_LOADLIBES += $(LIBBPF) -lelf
>  HOSTLOADLIBES_tracex4  += -lrt
> diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c
> new file mode 100644
> index 000..c58183a
> --- /dev/null
> +++ b/samples/bpf/xdp_sample_pkts_kern.c
> @@ -0,0 +1,48 @@
> +#include 
> +#include 
> +#include 
> +#include "bpf_helpers.h"
> +
> +#define SAMPLE_SIZE 64ul
> +
> +struct bpf_map_def SEC("maps") my_map = {
> +   .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> +   .key_size = sizeof(int),
> +   .value_size = sizeof(u32),
> +   .max_entries = 2,
> +};
> +
> +SEC("xdp_sample")
> +int xdp_sample_prog(struct xdp_md *ctx)
> +{
> +   void *data_end = (void *)(long)ctx->data_end;
> +   void *data = (void *)(long)ctx->data;
> +
> +   /* Metadata will be in the perf event before the packet data. */
> +   struct S {
> +   u16 cookie;
> +   u16 pkt_len;
> +   } __attribute__((packed)) metadata;
> +
> +   if (data + SAMPLE_SIZE < data_end) {
> +   /* The XDP perf_event_output handler will use the upper 32 bits
> +* of the flags argument as a number of bytes to include of the
> +* packet payload in the event data. If the size is too big, the
> +* call to bpf_perf_event_output will fail and return -EFAULT.
> +*
> +* See bpf_xdp_event_output in net/core/filter.c.
> +*/
> +   u64 flags = SAMPLE_SIZE << 32;
> +
> +   metadata.cookie = 0xdead;
> +   metadata.pkt_len = (u16)(data_end - data);
> +
> +   bpf_perf_event_output(ctx, &my_map, flags,
> + &metadata, sizeof(metadata));
> +   }
> +
> +   return XDP_PASS;
> +}
> +
> +char _license[] SEC("license") = "GPL";
> +u32 _version SEC("version") = LINUX_VERSION_CODE;
> diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c
> new file mode 100644
> index 000..f996917
> --- /dev/null
> +++ b/samples/bpf/xdp_sample_pkts_user.c
> @@ -0,0 +1,147 @@
> +/* This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + */
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> 

[PATCH bpf-next v4 5/5] selftests/bpf: test_sockmap, print additional test options

2018-05-30 Thread Prashant Bhole
Print the values of test options like apply, cork, start and end so that
individual failed tests can be identified for a manual run.

Acked-by: John Fastabend 
Signed-off-by: Prashant Bhole 
---
 tools/testing/selftests/bpf/test_sockmap.c | 28 +++---
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c 
b/tools/testing/selftests/bpf/test_sockmap.c
index 2bc70e1ab2eb..05c8cb71724a 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -876,6 +876,8 @@ static char *test_to_str(int test)
 #define OPTSTRING 60
 static void test_options(char *options)
 {
+   char tstr[OPTSTRING];
+
memset(options, 0, OPTSTRING);
 
if (txmsg_pass)
@@ -888,14 +890,22 @@ static void test_options(char *options)
strncat(options, "redir_noisy,", OPTSTRING);
if (txmsg_drop)
strncat(options, "drop,", OPTSTRING);
-   if (txmsg_apply)
-   strncat(options, "apply,", OPTSTRING);
-   if (txmsg_cork)
-   strncat(options, "cork,", OPTSTRING);
-   if (txmsg_start)
-   strncat(options, "start,", OPTSTRING);
-   if (txmsg_end)
-   strncat(options, "end,", OPTSTRING);
+   if (txmsg_apply) {
+   snprintf(tstr, OPTSTRING, "apply %d,", txmsg_apply);
+   strncat(options, tstr, OPTSTRING);
+   }
+   if (txmsg_cork) {
+   snprintf(tstr, OPTSTRING, "cork %d,", txmsg_cork);
+   strncat(options, tstr, OPTSTRING);
+   }
+   if (txmsg_start) {
+   snprintf(tstr, OPTSTRING, "start %d,", txmsg_start);
+   strncat(options, tstr, OPTSTRING);
+   }
+   if (txmsg_end) {
+   snprintf(tstr, OPTSTRING, "end %d,", txmsg_end);
+   strncat(options, tstr, OPTSTRING);
+   }
if (txmsg_ingress)
strncat(options, "ingress,", OPTSTRING);
if (txmsg_skb)
@@ -904,7 +914,7 @@ static void test_options(char *options)
 
 static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
 {
-   char *options = calloc(60, sizeof(char));
+   char *options = calloc(OPTSTRING, sizeof(char));
int err;
 
if (test == SENDPAGE)
-- 
2.17.0
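
For reference, a standalone sketch of the option-string assembly this
patch switches to, with illustrative values. Note the patch passes
OPTSTRING as the strncat() bound; the conventional safe bound is the
remaining space, as used below:

	#include <stdio.h>
	#include <string.h>

	#define OPTSTRING 60

	int main(void)
	{
		char options[OPTSTRING] = "";
		char tstr[OPTSTRING];
		int txmsg_apply = 1, txmsg_cork = 512;	/* illustrative */

		if (txmsg_apply) {
			snprintf(tstr, OPTSTRING, "apply %d,", txmsg_apply);
			strncat(options, tstr, OPTSTRING - strlen(options) - 1);
		}
		if (txmsg_cork) {
			snprintf(tstr, OPTSTRING, "cork %d,", txmsg_cork);
			strncat(options, tstr, OPTSTRING - strlen(options) - 1);
		}
		printf("%s\n", options);	/* apply 1,cork 512, */
		return 0;
	}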




[PATCH bpf-next v4 4/5] selftests/bpf: test_sockmap, fix data verification

2018-05-30 Thread Prashant Bhole
When data verification is enabled, some tests fail because verification is done
incorrectly. The following changes fix it:

- Identify the size of the data block to be verified
- Reset the verification counter when the data block size is reached
- Fix the value printed in case of verification failure

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Acked-by: John Fastabend 
Signed-off-by: Prashant Bhole 
---
 tools/testing/selftests/bpf/test_sockmap.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 5cd0550af595..2bc70e1ab2eb 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -337,8 +337,15 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
int fd_flags = O_NONBLOCK;
struct timeval timeout;
float total_bytes;
+   int bytes_cnt = 0;
+   int chunk_sz;
fd_set w;
 
+   if (opt->sendpage)
+   chunk_sz = iov_length * cnt;
+   else
+   chunk_sz = iov_length * iov_count;
+
fcntl(fd, fd_flags);
total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
err = clock_gettime(CLOCK_MONOTONIC, &s->start);
@@ -393,9 +400,14 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
errno = -EIO;
fprintf(stderr,
"detected data 
corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
-   i, j, d[j], k - 
1, d[j+1], k + 1);
+   i, j, d[j], k - 
1, d[j+1], k);
goto out_errno;
}
+   bytes_cnt++;
+   if (bytes_cnt == chunk_sz) {
+   k = 0;
+   bytes_cnt = 0;
+   }
recv--;
}
}
-- 
2.17.0
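
A standalone sketch of the verification scheme after this fix, with an
illustrative chunk size (the real test derives chunk_sz from iov_length,
iov_count and cnt): the expected byte value k restarts at every chunk
boundary.

	#include <stdio.h>

	#define CHUNK_SZ 8	/* illustrative */

	int main(void)
	{
		unsigned char buf[2 * CHUNK_SZ];
		int bytes_cnt = 0, k = 0, i;

		for (i = 0; i < (int)sizeof(buf); i++)	/* "sender" */
			buf[i] = i % CHUNK_SZ;

		for (i = 0; i < (int)sizeof(buf); i++) {	/* "receiver" */
			if (buf[i] != k++) {
				fprintf(stderr, "corruption @%d: %02x != %02x\n",
					i, buf[i], k - 1);
				return 1;
			}
			bytes_cnt++;
			if (bytes_cnt == CHUNK_SZ) {	/* chunk boundary */
				k = 0;
				bytes_cnt = 0;
			}
		}
		return 0;
	}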




[PATCH bpf-next v4 1/5] selftests/bpf: test_sockmap, check test failure

2018-05-30 Thread Prashant Bhole
Test failures are not identified because the exit codes of the RX/TX
threads are not checked. Also, the threads do not return correct exit
codes.

- Return an exit code from each thread depending on test execution status
- In the main thread, check the exit codes of the RX/TX threads
- Skip error checking for corked tests as they are expected to time out

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Signed-off-by: Prashant Bhole 
---
 tools/testing/selftests/bpf/test_sockmap.c | 27 +-
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index eb17fae458e6..7b2008a144cb 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -429,8 +429,8 @@ static int sendmsg_test(struct sockmap_options *opt)
struct msg_stats s = {0};
int iov_count = opt->iov_count;
int iov_buf = opt->iov_length;
+   int rx_status, tx_status;
int cnt = opt->rate;
-   int status;
 
errno = 0;
 
@@ -442,7 +442,7 @@ static int sendmsg_test(struct sockmap_options *opt)
rxpid = fork();
if (rxpid == 0) {
if (opt->drop_expected)
-   exit(1);
+   exit(0);
 
if (opt->sendpage)
iov_count = 1;
@@ -463,7 +463,9 @@ static int sendmsg_test(struct sockmap_options *opt)
"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB 
%fB/s %fGB/s\n",
s.bytes_sent, sent_Bps, sent_Bps/giga,
s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-   exit(1);
+   if (err && txmsg_cork)
+   err = 0;
+   exit(err ? 1 : 0);
} else if (rxpid == -1) {
perror("msg_loop_rx: ");
return errno;
@@ -491,14 +493,27 @@ static int sendmsg_test(struct sockmap_options *opt)
"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB 
%fB/s %fGB/s\n",
s.bytes_sent, sent_Bps, sent_Bps/giga,
s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-   exit(1);
+   exit(err ? 1 : 0);
} else if (txpid == -1) {
perror("msg_loop_tx: ");
return errno;
}
 
-   assert(waitpid(rxpid, &status, 0) == rxpid);
-   assert(waitpid(txpid, &status, 0) == txpid);
+   assert(waitpid(rxpid, &rx_status, 0) == rxpid);
+   assert(waitpid(txpid, &tx_status, 0) == txpid);
+   if (WIFEXITED(rx_status)) {
+   err = WEXITSTATUS(rx_status);
+   if (err) {
+   fprintf(stderr, "rx thread exited with err %d. ", err);
+   goto out;
+   }
+   }
+   if (WIFEXITED(tx_status)) {
+   err = WEXITSTATUS(tx_status);
+   if (err)
+   fprintf(stderr, "tx thread exited with err %d. ", err);
+   }
+out:
return err;
 }
 
-- 
2.17.0
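
A minimal standalone sketch of the fork/exit/waitpid pattern the patch
adds (exit values illustrative):

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		int status;
		pid_t pid = fork();

		if (pid == 0)
			exit(2);	/* child reports failure via exit code */

		if (waitpid(pid, &status, 0) != pid)
			return 1;
		if (WIFEXITED(status) && WEXITSTATUS(status))
			fprintf(stderr, "child exited with err %d\n",
				WEXITSTATUS(status));
		return 0;
	}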




[PATCH bpf-next v4 0/5] fix test_sockmap

2018-05-30 Thread Prashant Bhole
test_sockmap was originally written only to exercise kernel code
paths, so there was no strict checking of errors. When the code was
modified to run as selftests, due to lack of error handling it was not
able to detect test failures.

To improve this, the series fixes error handling, test run time
and data verification.

Also slightly improved test output by printing parameter values (cork,
apply, start, end) so that parameters for all tests are displayed.

Changes in v4:
  - patch1: Ignore RX timeout error only for corked tests
  - patch3: Set a different timeout for corked tests and reduce
  run time by reducing the number of iterations in some tests

Changes in v3:
  - Skipped error checking for corked tests

Prashant Bhole (5):
  selftests/bpf: test_sockmap, check test failure
  selftests/bpf: test_sockmap, join cgroup in selftest mode
  selftests/bpf: test_sockmap, timing improvements
  selftests/bpf: test_sockmap, fix data verification
  selftests/bpf: test_sockmap, print additional test options

 tools/testing/selftests/bpf/test_sockmap.c | 87 +-
 1 file changed, 67 insertions(+), 20 deletions(-)

-- 
2.17.0




[PATCH bpf-next v4 3/5] selftests/bpf: test_sockmap, timing improvements

2018-05-30 Thread Prashant Bhole
Currently the 10us delay is too low for many tests to succeed, so it
needs to be increased. Also, many corked tests are expected to hit the
RX timeout irrespective of the timeout value.

- This patch sets a 1000usec timeout for corked tests, because anything
less causes a broken-pipe error in the tx thread. It also sets a 1 second
timeout for all other tests, because anything less results in an RX
timeout
- Tests with apply=1 and a higher number of iterations were taking a lot
of time. This patch reduces test run time by reducing iterations.

real0m12.968s
user0m0.219s
sys 0m14.337s

Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
Signed-off-by: Prashant Bhole 
---
 tools/testing/selftests/bpf/test_sockmap.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 7f9ca79aadbc..5cd0550af595 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -345,8 +345,13 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
if (err < 0)
perror("recv start time: ");
while (s->bytes_recvd < total_bytes) {
-   timeout.tv_sec = 0;
-   timeout.tv_usec = 10;
+   if (txmsg_cork) {
+   timeout.tv_sec = 0;
+   timeout.tv_usec = 1000;
+   } else {
+   timeout.tv_sec = 1;
+   timeout.tv_usec = 0;
+   }
 
/* FD sets */
FD_ZERO(&w);
@@ -1025,14 +1030,14 @@ static int test_send(struct sockmap_options *opt, int cgrp)
 
opt->iov_length = 1;
opt->iov_count = 1;
-   opt->rate = 1024;
+   opt->rate = 512;
err = test_exec(cgrp, opt);
if (err)
goto out;
 
opt->iov_length = 256;
opt->iov_count = 1024;
-   opt->rate = 10;
+   opt->rate = 2;
err = test_exec(cgrp, opt);
if (err)
goto out;
-- 
2.17.0
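
Context for the timeout hunk above: POSIX allows select() to modify the
timeval it is given (Linux does), which is why the test reinitializes it
on every loop iteration. A minimal sketch of that pattern, with
illustrative fd and retry handling:

	#include <stdio.h>
	#include <sys/select.h>

	static int wait_readable(int fd, int corked, int max_tries)
	{
		struct timeval timeout;
		fd_set w;
		int i, n;

		for (i = 0; i < max_tries; i++) {
			/* reinit each pass: select() may clobber it */
			if (corked) {
				timeout.tv_sec = 0;
				timeout.tv_usec = 1000;
			} else {
				timeout.tv_sec = 1;
				timeout.tv_usec = 0;
			}

			FD_ZERO(&w);
			FD_SET(fd, &w);

			n = select(fd + 1, &w, NULL, NULL, &timeout);
			if (n > 0)
				return 0;	/* fd is readable */
			if (n < 0)
				return -1;	/* error */
			/* n == 0: timed out, try again */
		}
		return -1;
	}

	int main(void)
	{
		return wait_readable(0, 0, 1) ? 1 : 0;	/* up to 1s on stdin */
	}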




[PATCH bpf-next v4 2/5] selftests/bpf: test_sockmap, join cgroup in selftest mode

2018-05-30 Thread Prashant Bhole
In selftest mode, a temporary cgroup environment is created but the
cgroup is not joined, which causes test failures. Fix this by joining
the cgroup.

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Acked-by: John Fastabend 
Signed-off-by: Prashant Bhole 
---
 tools/testing/selftests/bpf/test_sockmap.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 7b2008a144cb..7f9ca79aadbc 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1344,6 +1344,11 @@ static int __test_suite(char *bpf_file)
return cg_fd;
}
 
+   if (join_cgroup(CG_PATH)) {
+   fprintf(stderr, "ERROR: failed to join cgroup\n");
+   return -EINVAL;
+   }
+
/* Tests basic commands and APIs with range of iov values */
txmsg_start = txmsg_end = 0;
err = test_txmsg(cg_fd);
-- 
2.17.0




Re: [PATCH bpf v3 3/5] selftests/bpf: test_sockmap, fix test timeout

2018-05-30 Thread Prashant Bhole




On 5/31/2018 4:59 AM, John Fastabend wrote:

On 05/30/2018 12:29 PM, Alexei Starovoitov wrote:

On Wed, May 30, 2018 at 02:56:09PM +0900, Prashant Bhole wrote:

In order to reduce the runtime of tests, the timeout for the select()
call was recently reduced from 1sec to 10usec. This was causing many
test failures, which were caught by the failure-handling commits in
this series.

Restore the timeout from 10usec back to 1sec.

Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
Signed-off-by: Prashant Bhole 
---
  tools/testing/selftests/bpf/test_sockmap.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 64f9e25c451f..9d01f5c2abe2 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -345,8 +345,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
if (err < 0)
perror("recv start time: ");
while (s->bytes_recvd < total_bytes) {
-   timeout.tv_sec = 0;
-   timeout.tv_usec = 10;
+   timeout.tv_sec = 1;
+   timeout.tv_usec = 0;


I've applied the set, but had to revert it, since it takes too long.

real1m40.124s
user0m0.375s
sys 0m14.521s



Dang, I thought it would be a bit longer but not minutes.


Myself and Daniel run the test semi-manually when we apply patches.
Adding 2 extra minutes of wait time is unnecessary.


Yep.


Especially since most of it is idle time.
Please find a way to fix tests differently.
btw I don't see any failures today. Not sure what is being fixed
by increasing a timeout.



Calling these fixes is a bit much, they are primarily improvements.

The background is, when I originally wrote the tests my goal was to
exercise the kernel code paths. Because of this I didn't really care if
the tests actually sent/recv all bytes in the test. (I have long
running tests using netperf/wrk/apached/etc. for that) But, the manual
tests do have an option to verify the data if specified. The 'verify'
option is a bit fragile in that with the right tests (e.g. drop)
or the certain options (e.g. cork) it can fail which is expected.

What Prashant added was support to actually verify the data correctly.
And also fix a few cgroup handling and some pretty printing as well.
He noticed the low timeout causing issue in these cases though so
increased it.

@Prashant, how about increasing this less dramatically because now
all cork tests are going to stall for 1s unless perfectly aligned.
How about 100us? Or even better we can conditionally set it based
on if tx_cork is set. If tx_cork is set use 1us otherwise use 200us
or something. (1s is really too high in any case for lo)

Also capturing some of the above in the cover letter would help
folks understand the context a bit better.



I did trial and error for the timeout values. Currently 1000us for corked
tests and 1 sec for other tests works fine. I observed a broken-pipe error
on the tx side when the timeout was < 1000us.


Also, tests with apply=1 and a higher number of iterations were taking
time, so reducing iterations reduces the test run time drastically.


real0m12.968s
user0m0.219s
sys 0m14.337s

Also I will try to explain background in the cover letter of next series.

-Prashant




Re: [PATCH net-next 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.

2018-05-30 Thread Samudrala, Sridhar

On 5/30/2018 3:53 PM, Jakub Kicinski wrote:

On Wed, 30 May 2018 14:23:06 -0700, Samudrala, Sridhar wrote:

On 5/29/2018 11:33 PM, Jakub Kicinski wrote:

On Tue, 29 May 2018 23:08:11 -0700, Michael Chan wrote:

On Tue, May 29, 2018 at 10:56 PM, Jakub Kicinski wrote:

On Tue, 29 May 2018 20:19:54 -0700, Michael Chan wrote:

On Tue, May 29, 2018 at 1:46 PM, Samudrala, Sridhar wrote:

Isn't ndo_set_vf_xxx() considered a legacy interface and not planned to be
extended?

+1 it's painful to see this feature being added to the legacy
API :(  Another duplicated configuration knob.
  

I didn't know about that.
  

Shouldn't we enable this via ethtool on the port representor netdev?

We discussed about this.  ethtool on the VF representor will only work
in switchdev mode and also will not support min/max values.

Ethtool channel API may be overdue a rewrite in devlink anyway, but I
feel like implementing switchdev mode and rewriting features in devlink
may be too much to ask.

Totally agreed.  And switchdev mode doesn't seem to be that widely
used at the moment.  Do you have other suggestions besides NDO?

At some points you (Broadcom) were working whole bunch of devlink
configuration options for the PCIe side of the ASIC.  The number of
queues relates to things like number of allocated MSI-X vectors, which
if memory serves me was in your devlink patch set.  In an ideal world
we would try to keep all those in one place :)

For PCIe config there is always the question of what can be configured
at runtime, and what requires a HW reset.  Therefore that devlink API
which could configure current as well as persistent device settings was
quite nice.  I'm not sure if reallocating queues would ever require
PCIe block reset but maybe...  Certainly it seems the notion of min
queues would make more sense in PCIe configuration devlink API than
ethtool channel API to me as well.

Queues are in the grey area between netdev and non-netdev constructs.
They make sense both from PCIe resource allocation perspective (i.e.
devlink PCIe settings) and netdev perspective (ethtool) because they
feed into things like qdisc offloads, maybe per-queue stats etc.

So yes...  IMHO it would be nice to add this to a devlink SR-IOV config
API and/or switchdev representors.  But neither of those are really an
option for you today so IDK :)

One reason why 'switchdev' mode is not yet widely used or enabled by default
could be due to the requirement to program the flow rules only via slow path.

Do you mean the fallback traffic requirement?


Yes.




Would it make sense to relax this requirement and support a mode where port
representors are created and let the PF driver implement a default policy that
adds flow rules for all the VFs to enable connectivity and let the user
add/modify the rules via port representors?

I definitely share your concerns, stopping a major HW vendor from using
this new and preferred mode is not helping us make progress.

The problem is that if we allow this diversion, i.e. driver to implement
some special policy, or pre-populate a bridge in a configuration that
suits the HW we may condition users to expect that as the standard Linux
behaviour.  And we will be stuck with it forever even tho your next gen
HW (ice?) may support correct behaviour.


Yes. ice can support slowpath behavior as required to support OVS offload.
However, I was just wondering if we should have an option to allow switchdev
without slowpath so that the user can use alternate mechanisms to program
the flow rules instead of having to use OVS.




We should perhaps separate switchdev mode from TC flower/OvS offloads.
Is your objective to implement OvS offload or just switchdev mode?

For OvS without proper fallback behaviour you may struggle.

Switchdev mode could be within your reach even without changing the
default rules.  What if you spawned all port netdevs (I dislike the
term representor, sorry, it's confusing people) in down state and then
refuse to bring them up unless user instantiated a bridge that would
behave in a way that your HW can support?  If ports are down you won't
have fallback traffic so no problem to solve.


If we want to use port netdev's admin state to control the link state of the
VFs then this will not work.
We need to only disable TX/RX but admin state and link state need to be
supported on the port netdevs.




[PATCH net-next] net/smc: fix error return code in smc_setsockopt()

2018-05-30 Thread Wei Yongjun
Fix to return error code -EINVAL instead of 0 if optlen is invalid.

Fixes: 01d2f7e2cdd3 ("net/smc: sockopts TCP_NODELAY and TCP_CORK")
Signed-off-by: Wei Yongjun 
---
 net/smc/af_smc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 2c369d4..973b447 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1420,7 +1420,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
return rc;
 
if (optlen < sizeof(int))
-   return rc;
+   return -EINVAL;
get_user(val, (int __user *)optval);
 
lock_sock(sk);



[PATCH net-next] net/mlx5: Make function mlx5_fpga_tls_send_teardown_cmd() static

2018-05-30 Thread Wei Yongjun
Fixes the following sparse warning:

drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c:199:6: warning:
 symbol 'mlx5_fpga_tls_send_teardown_cmd' was not declared. Should it be static?

Signed-off-by: Wei Yongjun 
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
index 2104801..c973623 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
@@ -196,8 +196,8 @@ static void mlx5_fpga_tls_flow_to_cmd(void *flow, void *cmd)
 MLX5_GET(tls_flow, flow, direction_sx));
 }
 
-void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev, void *flow,
-u32 swid, gfp_t flags)
+static void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev,
+   void *flow, u32 swid, gfp_t flags)
 {
struct mlx5_teardown_stream_context *ctx;
struct mlx5_fpga_dma_buf *buf;



[PATCH net-next] hv_netvsc: fix error return code in netvsc_probe()

2018-05-30 Thread Wei Yongjun
Fix to return a negative error code from the failover_register()
error handling case instead of 0, as done elsewhere in this function.

Fixes: 1ff78076d8dd ("netvsc: refactor notifier/event handling code to use the failover framework")
Signed-off-by: Wei Yongjun 
---
 drivers/net/hyperv/netvsc_drv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index ebe9642..bef4d55 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2031,8 +2031,10 @@ static int netvsc_probe(struct hv_device *dev,
}
 
net_device_ctx->failover = failover_register(net, &netvsc_failover_ops);
-   if (IS_ERR(net_device_ctx->failover))
+   if (IS_ERR(net_device_ctx->failover)) {
+   ret = PTR_ERR(net_device_ctx->failover);
goto err_failover;
+   }
 
return ret;
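
For readers unfamiliar with the idiom: failover_register() encodes a
negative errno inside the returned pointer, and without the PTR_ERR()
conversion netvsc_probe() would take the error path with ret still 0,
reporting success for a failed probe. A generic sketch of the pattern
(illustrative names, not from this driver):

	struct widget *w = widget_register();	/* may be ERR_PTR(-ENOMEM) */

	if (IS_ERR(w))
		return PTR_ERR(w);	/* decode to a negative errno */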



[PATCH bpf-next] xsk: temporarily disable AF_XDP

2018-05-30 Thread Björn Töpel
From: Björn Töpel 

Temporarily disable AF_XDP sockets, and hide uapi.

Signed-off-by: Björn Töpel 
---
 include/{uapi => }/linux/if_xdp.h | 0
 net/xdp/Kconfig   | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename include/{uapi => }/linux/if_xdp.h (100%)

diff --git a/include/uapi/linux/if_xdp.h b/include/linux/if_xdp.h
similarity index 100%
rename from include/uapi/linux/if_xdp.h
rename to include/linux/if_xdp.h
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
index 90e4a7152854..d845606dae7b 100644
--- a/net/xdp/Kconfig
+++ b/net/xdp/Kconfig
@@ -1,5 +1,5 @@
 config XDP_SOCKETS
-   bool "XDP sockets"
+   bool "XDP sockets" if n
depends on BPF_SYSCALL
default n
help
-- 
2.14.1
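
For context (not stated in the patch): writing the prompt as
'bool "XDP sockets" if n' ties its visibility to the constant n, which
is always false, so the symbol can no longer be enabled from any Kconfig
frontend; moving if_xdp.h out of include/uapi hides the user-visible
header at the same time.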



Re: [PATCH net] mlx4_core: restore optimal ICM memory allocation

2018-05-30 Thread Qing Huang




On 5/30/2018 2:30 PM, Eric Dumazet wrote:

On Wed, May 30, 2018 at 5:08 PM Qing Huang  wrote:


On 5/30/2018 1:50 PM, Eric Dumazet wrote:

On Wed, May 30, 2018 at 4:30 PM Qing Huang  wrote:

On 5/29/2018 9:11 PM, Eric Dumazet wrote:

Commit 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
brought a regression caught in our regression suite, thanks to KASAN.

If KASAN reported issue was really caused by smaller chunk sizes,
changing allocation
order dynamically will eventually hit the same issue.

Sigh, you have little idea of what your patch really did...

The KASAN part only shows the tip of the iceberg, but our main concern
is an increase of memory overhead.

Well, the commit log only mentioned KASAN, but the change here didn't
seem to solve the issue.

Can you elaborate ?

My patch solves our problems.

Both the memory overhead and KASAN splats are gone.


If the KASAN issue was triggered by using smaller chunks, then under
memory pressure with lots of fragments, low-order memory allocation
will do similar things. So perhaps in your test env, memory allocation
and usage is relatively static; that's probably why using larger chunks
didn't really exercise the low-order allocation code path, hence no
KASAN issue was spotted.

Smaller chunk sizes in the mlx4 driver are not supposed to cause any
memory corruption. We will probably need to continue to investigate
this. Can you provide the test command that triggers this issue when
running a KASAN kernel, so we can try to reproduce it in our lab? It
could be that the upstream code is missing some other fixes.



Alternative is to revert your patch, since we are now very late in 4.17 cycle.

Memory usage has grown a lot with your patch, since each 4KB page needs a full
struct mlx4_icm_chunk (256 bytes of overhead !)

Going to smaller chunks will have some overhead. It depends on the
application though.
What's the total increased memory consumption in your env?

As I explained, your patch adds 256 bytes of overhead per 4KB.

Your changelog did not mention that at all, and we discovered this
the hard way.
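
Illustrative arithmetic (not from the thread): 1 GB of ICM mapped in
4 KB chunks is 262,144 chunks; at 256 bytes of bookkeeping each, that is
64 MB of overhead, i.e. 256/4096 = 6.25% on top of the ICM memory itself.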


If you have some concern regarding memory usage, you should bring this 
up during code review.


Repeated failure and retry for lower-order allocations could be bad for
latency too. This wasn't mentioned in this commit either.

Like I said, how much overhead really depends on the application. 256
bytes x chunks may not be significant on a server with lots of memory.


That is pretty intolerable, and is a blocker for us, memory is precious.







Re: [PATCH net-next 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.

2018-05-30 Thread Jakub Kicinski
On Wed, 30 May 2018 14:23:06 -0700, Samudrala, Sridhar wrote:
> On 5/29/2018 11:33 PM, Jakub Kicinski wrote:
> > On Tue, 29 May 2018 23:08:11 -0700, Michael Chan wrote:  
> >> On Tue, May 29, 2018 at 10:56 PM, Jakub Kicinski wrote:  
> >>> On Tue, 29 May 2018 20:19:54 -0700, Michael Chan wrote:  
>  On Tue, May 29, 2018 at 1:46 PM, Samudrala, Sridhar wrote:  
> > Isn't ndo_set_vf_xxx() considered a legacy interface and not planned to 
> > be
> > extended?  
> >>> +1 it's painful to see this feature being added to the legacy
> >>> API :(  Another duplicated configuration knob.
> >>>  
>  I didn't know about that.
>   
> > Shouldn't we enable this via ethtool on the port representor netdev?  
>  We discussed about this.  ethtool on the VF representor will only work
>  in switchdev mode and also will not support min/max values.  
> >>> Ethtool channel API may be overdue a rewrite in devlink anyway, but I
> >>> feel like implementing switchdev mode and rewriting features in devlink
> >>> may be too much to ask.  
> >> Totally agreed.  And switchdev mode doesn't seem to be that widely
> >> used at the moment.  Do you have other suggestions besides NDO?  
> > At some points you (Broadcom) were working whole bunch of devlink
> > configuration options for the PCIe side of the ASIC.  The number of
> > queues relates to things like number of allocated MSI-X vectors, which
> > if memory serves me was in your devlink patch set.  In an ideal world
> > we would try to keep all those in one place :)
> >
> > For PCIe config there is always the question of what can be configured
> > at runtime, and what requires a HW reset.  Therefore that devlink API
> > which could configure current as well as persistent device settings was
> > quite nice.  I'm not sure if reallocating queues would ever require
> > PCIe block reset but maybe...  Certainly it seems the notion of min
> > queues would make more sense in PCIe configuration devlink API than
> > ethtool channel API to me as well.
> >
> > Queues are in the grey area between netdev and non-netdev constructs.
> > They make sense both from PCIe resource allocation perspective (i.e.
> > devlink PCIe settings) and netdev perspective (ethtool) because they
> > feed into things like qdisc offloads, maybe per-queue stats etc.
> >
> > So yes...  IMHO it would be nice to add this to a devlink SR-IOV config
> > API and/or switchdev representors.  But neither of those are really an
> > option for you today so IDK :)  
> 
> One reason why 'switchdev' mode is not yet widely used or enabled by default
> could be due to the requirement to program the flow rules only via slow path.

Do you mean the fallback traffic requirement?

> Would it make sense to relax this requirement and support a mode where port
> representors are created and let the PF driver implement a default policy that
> adds flow rules for all the VFs to enable connectivity and let the user
> add/modify the rules via port representors?

I definitely share your concerns, stopping a major HW vendor from using
this new and preferred mode is not helping us make progress.

The problem is that if we allow this diversion, i.e. driver to implement
some special policy, or pre-populate a bridge in a configuration that
suits the HW we may condition users to expect that as the standard Linux
behaviour.  And we will be stuck with it forever even tho your next gen
HW (ice?) may support correct behaviour.

We should perhaps separate switchdev mode from TC flower/OvS offloads.
Is your objective to implement OvS offload or just switchdev mode?  

For OvS without proper fallback behaviour you may struggle.

Switchdev mode could be within your reach even without changing the
default rules.  What if you spawned all port netdevs (I dislike the
term representor, sorry, it's confusing people) in down state and then
refuse to bring them up unless user instantiated a bridge that would
behave in a way that your HW can support?  If ports are down you won't
have fallback traffic so no problem to solve.




Re: [PATCH net-next 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.

2018-05-30 Thread Jakub Kicinski
On Wed, 30 May 2018 00:18:39 -0700, Michael Chan wrote:
> On Tue, May 29, 2018 at 11:33 PM, Jakub Kicinski wrote:
> > At some points you (Broadcom) were working whole bunch of devlink
> > configuration options for the PCIe side of the ASIC.  The number of
> > queues relates to things like number of allocated MSI-X vectors, which
> > if memory serves me was in your devlink patch set.  In an ideal world
> > we would try to keep all those in one place :)  
> 
> Yeah, another colleague is now working with Mellanox on something similar.
> 
> One difference between those devlink parameters and these queue
> parameters is that the former are more permanent and global settings.
> For example, number of VFs or number of MSIX per VF are persistent
> settings once they are set and after PCIe reset.  On the other hand,
> these queue settings are pure run-time settings and may be unique for
> each VF.  These are not stored as there is no room in NVRAM to store
> 128 sets or more of these parameters.

Indeed, I think the API must be flexible as to what is persistent and
what is not because HW will certainly differ in that regard.  And
agreed, queues may be a bit of a stretch here, but worth a try.

> Anyway, let me discuss this with my colleague to see if there is a
> natural fit for these queue parameters in the devlink infrastructure
> that they are working on.

Thank you!


Re: [bpf-next V1 PATCH 0/8] bpf/xdp: add flags argument to ndo_xdp_xmit and flag flush operation

2018-05-30 Thread Song Liu
Overall, this set looks good to me. The only suggestion I have is to add more
documentation on the expected behavior of XDP_XMIT_FLUSH in netdevice.h
(as part of 01/08).

Thanks,
Song
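
A sketch of the kind of comment being asked for (wording and placement
assumed, not from the posted series):

	/* Flags for ndo_xdp_xmit. */
	#define XDP_XMIT_FLUSH		(1U << 0)	/* the driver must flush any
							 * queued frames, e.g. ring the
							 * doorbell / bump the tail
							 * pointer, before returning
							 */
	#define XDP_XMIT_FLAGS_MASK	(XDP_XMIT_FLUSH)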


On Wed, May 30, 2018 at 11:00 AM, Jesper Dangaard Brouer
 wrote:
> As I mentioned in merge commit 10f678683e4 ("Merge branch 'xdp_xmit-bulking'")
> I plan to change the API for ndo_xdp_xmit once more, by adding a flags
> argument, which is done in this patchset.
>
> I know it is late in the cycle (currently at rc7), but it would be
> nice to avoid changing NDOs over several kernel releases, as it is
> annoying to vendors and distro backporters, but it is not strictly
> UAPI so it is allowed (according to Alexei).
>
> The end-goal is getting rid of the ndo_xdp_flush operation, as it will
> make it possible for drivers to implement a TXQ synchronization mechanism
> that is not necessarily derived from the CPU id (smp_processor_id).
>
> This patchset removes all callers of the ndo_xdp_flush operation, but
> it doesn't take the last step of removing it from all drivers.  This
> can be done later, or I can update the patchset on request.
>
> Micro-benchmarks only show a very small performance improvement, for
> map-redirect around ~2 ns, and for non-map redirect ~7 ns.  I've not
> benchmarked this with CONFIG_RETPOLINE, but the performance benefit
> should be more visible given we end-up removing an indirect call.
>
> ---
>
> Jesper Dangaard Brouer (8):
>   xdp: add flags argument to ndo_xdp_xmit API
>   i40e: implement flush flag for ndo_xdp_xmit
>   ixgbe: implement flush flag for ndo_xdp_xmit
>   tun: implement flush flag for ndo_xdp_xmit
>   virtio_net: implement flush flag for ndo_xdp_xmit
>   xdp: done implementing ndo_xdp_xmit flush flag for all drivers
>   bpf/xdp: non-map redirect can avoid calling ndo_xdp_flush
>   bpf/xdp: devmap can avoid calling ndo_xdp_flush
>
>
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c   |9 -
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h   |3 ++-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   23 +--
>  drivers/net/tun.c |   25 ++---
>  drivers/net/virtio_net.c  |9 -
>  include/linux/netdevice.h |7 ---
>  include/net/xdp.h |4 
>  kernel/bpf/devmap.c   |   20 +++-
>  net/core/filter.c |3 +--
>  9 files changed, 69 insertions(+), 34 deletions(-)
>
> --


[PATCH net-next] net: dsa: mv88e6xxx: Be explicit about DT or pdata

2018-05-30 Thread Andrew Lunn
Make it explicit that either device tree is used or platform data.  If
neither is available, abort the probe.

Reported-by: Dan Carpenter 
Fixes: 877b7cb0b6f2 ("net: dsa: mv88e6xxx: Add minimal platform_data support")
Signed-off-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 12df00f593b7..437cd6eb4faa 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4389,6 +4389,9 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev)
int port;
int err;
 
+   if (!np && !pdata)
+   return -EINVAL;
+
if (np)
compat_info = of_device_get_match_data(dev);
 
-- 
2.17.0



Re: [bpf-next V1 PATCH 8/8] bpf/xdp: devmap can avoid calling ndo_xdp_flush

2018-05-30 Thread Song Liu
On Wed, May 30, 2018 at 11:01 AM, Jesper Dangaard Brouer
 wrote:
> The XDP_REDIRECT map devmap can avoid using ndo_xdp_flush, by instead
> instructing ndo_xdp_xmit to flush via XDP_XMIT_FLUSH flag in
> appropriate places.
>
> Notice after this patch it is possible to remove ndo_xdp_flush
> completely, as this is the last user of ndo_xdp_flush. This is left
> for later patches, to keep driver changes separate.
>
> Signed-off-by: Jesper Dangaard Brouer 
> ---
>  kernel/bpf/devmap.c |   20 +++-
>  1 file changed, 7 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 04fbd75a5274..9c846a7a8cff 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -217,7 +217,7 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
>  }
>
>  static int bq_xmit_all(struct bpf_dtab_netdev *obj,
> -struct xdp_bulk_queue *bq)
> +  struct xdp_bulk_queue *bq, bool flush)

How about we use "int flags" instead of "bool flush" for easier extension?

Thanks,
Song

>  {
> struct net_device *dev = obj->dev;
> int sent = 0, drops = 0, err = 0;
> @@ -232,7 +232,8 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
> prefetch(xdpf);
> }
>
> -   sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0);
> +   sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q,
> +                                            flush ? XDP_XMIT_FLUSH : 0);
> if (sent < 0) {
> err = sent;
> sent = 0;
> @@ -276,7 +277,6 @@ void __dev_map_flush(struct bpf_map *map)
> for_each_set_bit(bit, bitmap, map->max_entries) {
> struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
> struct xdp_bulk_queue *bq;
> -   struct net_device *netdev;
>
> /* This is possible if the dev entry is removed by user space
>  * between xdp redirect and flush op.
> @@ -287,10 +287,7 @@ void __dev_map_flush(struct bpf_map *map)
> __clear_bit(bit, bitmap);
>
> bq = this_cpu_ptr(dev->bulkq);
> -   bq_xmit_all(dev, bq);
> -   netdev = dev->dev;
> -   if (likely(netdev->netdev_ops->ndo_xdp_flush))
> -   netdev->netdev_ops->ndo_xdp_flush(netdev);
> +   bq_xmit_all(dev, bq, true);
> }
>  }
>
> @@ -320,7 +317,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
> struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
>
> if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
> -   bq_xmit_all(obj, bq);
> +   bq_xmit_all(obj, bq, false);
>
> /* Ingress dev_rx will be the same for all xdp_frame's in
>  * bulk_queue, because bq stored per-CPU and must be flushed
> @@ -359,8 +356,7 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
>
>  static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
>  {
> -   if (dev->dev->netdev_ops->ndo_xdp_flush) {
> -   struct net_device *fl = dev->dev;
> +   if (dev->dev->netdev_ops->ndo_xdp_xmit) {
> struct xdp_bulk_queue *bq;
> unsigned long *bitmap;
>
> @@ -371,9 +367,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
> __clear_bit(dev->bit, bitmap);
>
> bq = per_cpu_ptr(dev->bulkq, cpu);
> -   bq_xmit_all(dev, bq);
> -
> -   fl->netdev_ops->ndo_xdp_flush(dev->dev);
> +   bq_xmit_all(dev, bq, true);
> }
> }
>  }
>
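
A minimal sketch of the "int flags" variant suggested above
(illustrative, not a posted patch): callers pass XDP xmit flags straight
through, so adding future flags needs no further signature change.

	static int bq_xmit_all(struct bpf_dtab_netdev *obj,
			       struct xdp_bulk_queue *bq, u32 flags)
	{
		struct net_device *dev = obj->dev;
		int sent;

		sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
		/* ... error handling and tracepoint as in the patch ... */
		return sent;
	}

	/* flush site:   bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
	 * enqueue site: bq_xmit_all(obj, bq, 0);
	 */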


Re: [bpf-next V1 PATCH 2/8] i40e: implement flush flag for ndo_xdp_xmit

2018-05-30 Thread Song Liu
On Wed, May 30, 2018 at 11:00 AM, Jesper Dangaard Brouer
 wrote:
> Signed-off-by: Jesper Dangaard Brouer 

I guess we still need to say something in the commit message?

> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c |5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index c0451d6e0790..03c1446f0465 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -3685,7 +3685,7 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
> if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
> return -ENXIO;
>
> -   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
> +   if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
> return -EINVAL;
>
> for (i = 0; i < n; i++) {
> @@ -3699,6 +3699,9 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
> }
> }
>
> +   if (unlikely(flags & XDP_XMIT_FLUSH))
> +   i40e_xdp_ring_update_tail(vsi->xdp_rings[queue_index]);
> +
> return n - drops;

Do we still flush when drops > 0?

Thanks,
Song

>  }
>
>


Re: [bpf-next V1 PATCH 1/8] xdp: add flags argument to ndo_xdp_xmit API

2018-05-30 Thread Song Liu
On Wed, May 30, 2018 at 11:00 AM, Jesper Dangaard Brouer
 wrote:
> This patch only change the API and reject any use of flags. This is an
> intermediate step that allows us to implement the flush flag operation
> later, for each individual driver in a separate patch.
>
> The plan is to implement flush operation via XDP_XMIT_FLUSH flag
> and then remove XDP_XMIT_FLAGS_NONE when done.
>
> Signed-off-by: Jesper Dangaard Brouer 
> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c   |6 +-
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h   |3 ++-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |5 -
>  drivers/net/tun.c |8 ++--
>  drivers/net/virtio_net.c  |5 -
>  include/linux/netdevice.h |7 ---
>  include/net/xdp.h |5 +
>  kernel/bpf/devmap.c   |2 +-
>  net/core/filter.c |2 +-
>  9 files changed, 32 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index 9b698c5acd05..c0451d6e0790 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -3670,7 +3670,8 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
>   * For error cases, a negative errno code is returned and no-frames
>   * are transmitted (caller must handle freeing frames).
>   **/
> -int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
> +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
> + u32 flags)
>  {
> struct i40e_netdev_priv *np = netdev_priv(dev);
> unsigned int queue_index = smp_processor_id();
> @@ -3684,6 +3685,9 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
> if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
> return -ENXIO;
>
> +   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
> +   return -EINVAL;
> +
> for (i = 0; i < n; i++) {
> struct xdp_frame *xdpf = frames[i];
> int err;
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index eb8804b3d7b6..820f76db251b 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -487,7 +487,8 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
>  void i40e_detect_recover_hung(struct i40e_vsi *vsi);
>  int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
>  bool __i40e_chk_linearize(struct sk_buff *skb);
> -int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
> +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
> + u32 flags);
>  void i40e_xdp_flush(struct net_device *dev);
>
>  /**
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index 031d65c4178d..87f088f4af52 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -10023,7 +10023,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>  }
>
>  static int ixgbe_xdp_xmit(struct net_device *dev, int n,
> - struct xdp_frame **frames)
> + struct xdp_frame **frames, u32 flags)
>  {
> struct ixgbe_adapter *adapter = netdev_priv(dev);
> struct ixgbe_ring *ring;
> @@ -10033,6 +10033,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
> if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
> return -ENETDOWN;
>
> +   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
> +   return -EINVAL;
> +
> /* During program transitions its possible adapter->xdp_prog is assigned
>  * but ring has not been configured yet. In this case simply abort xmit.
>  */
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 2265d2ccea47..b182b8cdd219 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1285,7 +1285,8 @@ static const struct net_device_ops tun_netdev_ops = {
> .ndo_get_stats64= tun_net_get_stats64,
>  };
>
> -static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
> +static int tun_xdp_xmit(struct net_device *dev, int n,
> +   struct xdp_frame **frames, u32 flags)
>  {
> struct tun_struct *tun = netdev_priv(dev);
> struct tun_file *tfile;
> @@ -1294,6 +1295,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames
> int cnt = n;
> int i;
>
> +   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
> +   return -EINVAL;
> +
> 

Re: [PATCH net] mlx4_core: restore optimal ICM memory allocation

2018-05-30 Thread Eric Dumazet
On Wed, May 30, 2018 at 5:08 PM Qing Huang  wrote:
>
>
>
> On 5/30/2018 1:50 PM, Eric Dumazet wrote:
> > On Wed, May 30, 2018 at 4:30 PM Qing Huang  wrote:
> >>
> >>
> >> On 5/29/2018 9:11 PM, Eric Dumazet wrote:
> >>> Commit 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
> >>> brought a regression caught in our regression suite, thanks to KASAN.
> >> If KASAN reported issue was really caused by smaller chunk sizes,
> >> changing allocation
> >> order dynamically will eventually hit the same issue.
> > Sigh, you have little idea of what your patch really did...
> >
> > The KASAN part only shows the tip of the iceberg, but our main concern
> > is an increase of memory overhead.
>
> Well, the commit log only mentioned KASAN, but the change here didn't
> seem to solve the issue.

Can you elaborate ?

My patch solves our problems.

Both the memory overhead and KASAN splats are gone.

>
> >
> > Alternative is to revert your patch, since we are now very late in 4.17 cycle.
> >
> > Memory usage has grown a lot with your patch, since each 4KB page needs a full
> > struct mlx4_icm_chunk (256 bytes of overhead !)
>
> Going to smaller chunks will have some overhead. It depends on the
> application though.
> What's the total increased memory consumption in your env?


As I explained, your patch adds 256 bytes of overhead per 4KB.

Your changelog did not mention that at all, and we discovered this
the hard way.

That is pretty intolerable, and is a blocker for us, memory is precious.


[PATCH bpf-next] bpf: prevent non-IPv4 socket to be added into sock hash

2018-05-30 Thread Wei Wang
From: Wei Wang 

Sock hash only supports IPv4 socket proto right now.
If a non-IPv4 socket gets stored in the BPF map, sk->sk_prot gets
overwritten with the v4 tcp prot.

Syskaller reported the following related issue on an IPv6 socket:
BUG: KASAN: slab-out-of-bounds in ip6_dst_idev include/net/ip6_fib.h:203 [inline]
BUG: KASAN: slab-out-of-bounds in ip6_xmit+0x2002/0x23f0 net/ipv6/ip6_output.c:264
Read of size 8 at addr 8801b300edb0 by task syz-executor888/4522

CPU: 0 PID: 4522 Comm: syz-executor888 Not tainted 4.17.0-rc4+ #17
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
 ip6_dst_idev include/net/ip6_fib.h:203 [inline]
 ip6_xmit+0x2002/0x23f0 net/ipv6/ip6_output.c:264
 inet6_csk_xmit+0x377/0x630 net/ipv6/inet6_connection_sock.c:139
 tcp_transmit_skb+0x1be0/0x3e40 net/ipv4/tcp_output.c:1159
 tcp_send_syn_data net/ipv4/tcp_output.c:3441 [inline]
 tcp_connect+0x2207/0x45a0 net/ipv4/tcp_output.c:3480
 tcp_v4_connect+0x1934/0x1d50 net/ipv4/tcp_ipv4.c:272
 __inet_stream_connect+0x943/0x1120 net/ipv4/af_inet.c:655
 tcp_sendmsg_fastopen net/ipv4/tcp.c:1162 [inline]
 tcp_sendmsg_locked+0x2859/0x3ee0 net/ipv4/tcp.c:1209
 tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1447
 inet_sendmsg+0x19f/0x690 net/ipv4/af_inet.c:798
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg+0xd5/0x120 net/socket.c:639
 ___sys_sendmsg+0x805/0x940 net/socket.c:2117
 __sys_sendmsg+0x115/0x270 net/socket.c:2155
 __do_sys_sendmsg net/socket.c:2164 [inline]
 __se_sys_sendmsg net/socket.c:2162 [inline]
 __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2162
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x43ff99
RSP: 002b:7ffc00bd1cf8 EFLAGS: 0217 ORIG_RAX: 002e
RAX: ffda RBX: 004002c8 RCX: 0043ff99
RDX: 2000 RSI: 2580 RDI: 0003
RBP: 006ca018 R08: 004002c8 R09: 004002c8
R10: 004002c8 R11: 0217 R12: 004018c0
R13: 00401950 R14:  R15: 

Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Reported-by: syzbot+5c063698bdbfac19f...@syzkaller.appspotmail.com
Signed-off-by: Wei Wang 
Acked-by: Eric Dumazet 
Acked-by: Willem de Bruijn 
---
 kernel/bpf/sockmap.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 3b28955a6383..0e7b88bc3e3f 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2300,6 +2300,11 @@ static int sock_hash_update_elem(struct bpf_map *map,
return -EINVAL;
}
 
+   if (skops.sk->sk_family != AF_INET) {
+   fput(socket->file);
+   return -EAFNOSUPPORT;
+   }
+
err = sock_hash_ctx_update_elem(&skops, map, key, flags);
fput(socket->file);
return err;
-- 
2.17.1.1185.g55be947832-goog



[PATCH bpf] bpf: prevent non-ipv4 socket to be added into sock map

2018-05-30 Thread Wei Wang
From: Wei Wang 

Sock map only supports IPv4 socket proto right now.
If a non-IPv4 socket gets stored in the BPF map, sk->sk_prot gets
overwritten with the v4 tcp prot.
It could potentially cause issues when invoking functions from
sk->sk_prot later in the stack.

Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Signed-off-by: Wei Wang 
Acked-by: Eric Dumazet 
Acked-by: Willem de Bruijn 
---
 kernel/bpf/sockmap.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 95a84b2f10ce..1984922f99ee 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1873,6 +1873,11 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EOPNOTSUPP;
}
 
+   if (skops.sk->sk_family != AF_INET) {
+   fput(socket->file);
+   return -EAFNOSUPPORT;
+   }
+
err = sock_map_ctx_update_elem(&skops, map, key, flags);
fput(socket->file);
return err;
-- 
2.17.1.1185.g55be947832-goog



Re: [PATCH net-next 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.

2018-05-30 Thread Samudrala, Sridhar

On 5/29/2018 11:33 PM, Jakub Kicinski wrote:

On Tue, 29 May 2018 23:08:11 -0700, Michael Chan wrote:

On Tue, May 29, 2018 at 10:56 PM, Jakub Kicinski wrote:

On Tue, 29 May 2018 20:19:54 -0700, Michael Chan wrote:

On Tue, May 29, 2018 at 1:46 PM, Samudrala, Sridhar wrote:

Isn't ndo_set_vf_xxx() considered a legacy interface and not planned to be
extended?

+1 it's painful to see this feature being added to the legacy
API :(  Another duplicated configuration knob.


I didn't know about that.


Shouldn't we enable this via ethtool on the port representor netdev?

We discussed about this.  ethtool on the VF representor will only work
in switchdev mode and also will not support min/max values.

Ethtool channel API may be overdue a rewrite in devlink anyway, but I
feel like implementing switchdev mode and rewriting features in devlink
may be too much to ask.

Totally agreed.  And switchdev mode doesn't seem to be that widely
used at the moment.  Do you have other suggestions besides NDO?

At some points you (Broadcom) were working whole bunch of devlink
configuration options for the PCIe side of the ASIC.  The number of
queues relates to things like number of allocated MSI-X vectors, which
if memory serves me was in your devlink patch set.  In an ideal world
we would try to keep all those in one place :)

For PCIe config there is always the question of what can be configured
at runtime, and what requires a HW reset.  Therefore that devlink API
which could configure current as well as persistent device settings was
quite nice.  I'm not sure if reallocating queues would ever require
PCIe block reset but maybe...  Certainly it seems the notion of min
queues would make more sense in PCIe configuration devlink API than
ethtool channel API to me as well.

Queues are in the grey area between netdev and non-netdev constructs.
They make sense both from PCIe resource allocation perspective (i.e.
devlink PCIe settings) and netdev perspective (ethtool) because they
feed into things like qdisc offloads, maybe per-queue stats etc.

So yes...  IMHO it would be nice to add this to a devlink SR-IOV config
API and/or switchdev representors.  But neither of those are really an
option for you today so IDK :)


One reason why 'switchdev' mode is not yet widely used or enabled by default
could be due to the requirement to program the flow rules only via slow path.

Would it make sense to relax this requirement and support a mode where port
representors are created and let the PF driver implement a default policy that
adds flow rules for all the VFs to enable connectivity and let the user
add/modify the rules via port representors?



Re: [PATCH bpf-next] bpf: Change bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address families

2018-05-30 Thread Alexei Starovoitov
On Wed, May 30, 2018 at 12:24:17PM -0700, dsah...@kernel.org wrote:
> From: David Ahern 
> 
> Update bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address
> families. Allows userspace to probe for support as more are added
> (e.g., AF_MPLS).
> 
> Signed-off-by: David Ahern 

Applied, Thanks



Re: [PATCH net] mlx4_core: restore optimal ICM memory allocation

2018-05-30 Thread Qing Huang




On 5/30/2018 1:50 PM, Eric Dumazet wrote:

On Wed, May 30, 2018 at 4:30 PM Qing Huang  wrote:



On 5/29/2018 9:11 PM, Eric Dumazet wrote:

Commit 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
brought a regression caught in our regression suite, thanks to KASAN.

If the KASAN-reported issue was really caused by smaller chunk sizes,
changing the allocation order dynamically will eventually hit the same issue.

Sigh, you have little idea of what your patch really did...

The KASAN part only shows the tip of the iceberg, but our main concern
is an increase of memory overhead.


Well, the commit log only mentioned KASAN, but the change here didn't
seem to solve the issue.



Alternative is to revert your patch, since we are now very late in 4.17 cycle.

Memory usage has grown a lot with your patch, since each 4KB page needs a full
struct mlx4_icm_chunk (256 bytes of overhead !)


Going to smaller chunks will have some overhead. It depends on the 
application though.

What's the total increased memory consumption in your env?



Really we have no choice here, your patch went too far and increased
memory consumption quite a lot.






My patch is simply the best way to address your original concern, and
not increase overall overhead.

( each struct mlx4_icm_chunk should be able to store
MLX4_ICM_CHUNK_LEN pages, instead of one page of 4KB )




Re: [PATCH net] mlx4_core: restore optimal ICM memory allocation

2018-05-30 Thread Eric Dumazet
On Wed, May 30, 2018 at 4:30 PM Qing Huang  wrote:
>
>
>
> On 5/29/2018 9:11 PM, Eric Dumazet wrote:
> > Commit 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
> > brought a regression caught in our regression suite, thanks to KASAN.
>
> If the KASAN-reported issue was really caused by smaller chunk sizes,
> changing the allocation order dynamically will eventually hit the same issue.

Sigh, you have little idea of what your patch really did...

The KASAN part only shows the tip of the iceberg, but our main concern
is an increase of memory overhead.

Alternative is to revert your patch, since we are now very late in 4.17 cycle.

Memory usage has grown a lot with your patch, since each 4KB page needs a full
struct mlx4_icm_chunk (256 bytes of overhead !)

Really we have no choice here, your patch went too far and increased
memory consumption quite a lot.

My patch is simply the best way to address your original concern, and
not increase overall overhead.

( each struct mlx4_icm_chunk should be able to store
MLX4_ICM_CHUNK_LEN pages, instead of one page of 4KB )
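
[For reference, the structure Eric refers to looks roughly like this in
drivers/net/ethernet/mellanox/mlx4/icm.h; a from-memory sketch, shown only to
make the 256-byte overhead concrete, with MLX4_ICM_CHUNK_LEN derived so the
whole struct fits in 256 bytes:

	struct mlx4_icm_chunk {
		struct list_head	list;
		int			npages;
		int			nsg;
		struct scatterlist	mem[MLX4_ICM_CHUNK_LEN];
	};

With 4KB allocations, each chunk carries a single page, so the remaining
scatterlist slots are pure overhead.]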


Re: [PATCH v3 11/11] net: sched: change action API to use array of pointers to actions

2018-05-30 Thread Jiri Pirko
Sun, May 27, 2018 at 11:17:29PM CEST, vla...@mellanox.com wrote:
>The act API used a linked list to pass sets of actions to functions. It is
>an intrusive data structure that stores list nodes inside the action structure
>itself, which means it is not safe to modify such a list concurrently.
>However, the action API doesn't use any linked-list-specific operations on this
>set of actions, so it can be safely refactored into a plain pointer array.
>
>Refactor action API to use array of pointers to tc_actions instead of
>linked list. Change argument 'actions' type of exported action init,
>destroy and dump functions.
>
>Signed-off-by: Vlad Buslov 

Even with the nit Marcelo found, this looks fine to me.

Acked-by: Jiri Pirko 


Re: [PATCH net-next v2 0/2] net: phy: improve PHY suspend/resume

2018-05-30 Thread Andrew Lunn
> I think we need a better solution than spending the effort needed
> to make the MDIO ops runtime-pm-aware. In general there seems to be
> just one network driver using both phylib and runtime pm, so most
> drivers aren't affected (yet).
> 
> I will spend a few more thoughts on a solution.

Hi Heiner

Please keep in mind that MDIO is a generic bus. Many Ethernet switches
are connected via MDIO. Some of those switches have MDIO busses of
their own. Also, some Broadcom devices have USB-PHYs controlled over
MDIO, etc.

So you need a generic solution here.

   Andrew


Re: [PATCH net] mlx4_core: restore optimal ICM memory allocation

2018-05-30 Thread Qing Huang




On 5/29/2018 9:11 PM, Eric Dumazet wrote:

Commit 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
brought a regression caught in our regression suite, thanks to KASAN.


If the KASAN-reported issue was really caused by smaller chunk sizes,
changing the allocation order dynamically will eventually hit the same issue.



Note that mlx4_alloc_icm() is already able to try high order allocations
and fallback to low-order allocations under high memory pressure.

We only have to tweak gfp_mask a bit, to help falling back faster,
without risking OOM killings.

BUG: KASAN: slab-out-of-bounds in to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
Read of size 4 at addr 8817df584f68 by task qp_listing_test/92585

CPU: 38 PID: 92585 Comm: qp_listing_test Tainted: G   O
Call Trace:
  [] dump_stack+0x4d/0x72
  [] print_address_description+0x6f/0x260
  [] kasan_report+0x257/0x370
  [] __asan_report_load4_noabort+0x19/0x20
  [] to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
  [] mlx4_ib_query_qp+0x1213/0x1660 [mlx4_ib]
  [] qpstat_print_qp+0x13b/0x500 [ib_uverbs]
  [] qpstat_seq_show+0x4a/0xb0 [ib_uverbs]
  [] seq_read+0xa9c/0x1230
  [] proc_reg_read+0xc1/0x180
  [] __vfs_read+0xe8/0x730
  [] vfs_read+0xf7/0x300
  [] SyS_read+0xd2/0x1b0
  [] do_syscall_64+0x186/0x420
  [] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x7f851a7bb30d
RSP: 002b:7ffd09a758c0 EFLAGS: 0293 ORIG_RAX: 
RAX: ffda RBX: 7f84ff959440 RCX: 7f851a7bb30d
RDX: 0003fc00 RSI: 7f84ff60a000 RDI: 000b
RBP: 7ffd09a75900 R08:  R09: 
R10: 0022 R11: 0293 R12: 
R13: 0003 R14: 0003 R15: 7f84ff60a000

Allocated by task 4488:
  save_stack+0x46/0xd0
  kasan_kmalloc+0xad/0xe0
  __kmalloc+0x101/0x5e0
  ib_register_device+0xc03/0x1250 [ib_core]
  mlx4_ib_add+0x27d6/0x4dd0 [mlx4_ib]
  mlx4_add_device+0xa9/0x340 [mlx4_core]
  mlx4_register_interface+0x16e/0x390 [mlx4_core]
  xhci_pci_remove+0x7a/0x180 [xhci_pci]
  do_one_initcall+0xa0/0x230
  do_init_module+0x1b9/0x5a4
  load_module+0x63e6/0x94c0
  SYSC_init_module+0x1a4/0x1c0
  SyS_init_module+0xe/0x10
  do_syscall_64+0x186/0x420
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Freed by task 0:
(stack is not available)

The buggy address belongs to the object at 8817df584f40
  which belongs to the cache kmalloc-32 of size 32
The buggy address is located 8 bytes to the right of
  32-byte region [8817df584f40, 8817df584f60)
The buggy address belongs to the page:
page:ea005f7d6100 count:1 mapcount:0 mapping:8817df584000 
index:0x8817df584fc1
flags: 0x8800100(slab)
raw: 08800100 8817df584000 8817df584fc1 0001003f
raw: ea005f3ac0a0 ea005c476760 8817fec00900 883ff78d26c0
page dumped because: kasan: bad access detected
page->mem_cgroup:883ff78d26c0

Memory state around the buggy address:
  8817df584e00: 00 03 fc fc fc fc fc fc 00 03 fc fc fc fc fc fc
  8817df584e80: 00 00 00 04 fc fc fc fc 00 00 00 fc fc fc fc fc

8817df584f00: fb fb fb fb fc fc fc fc 00 00 00 00 fc fc fc fc

   ^
  8817df584f80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
  8817df585000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

Fixes: 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
Signed-off-by: Eric Dumazet 
Cc: John Sperbeck 
Cc: Tarick Bedeir 
Cc: Qing Huang 
Cc: Daniel Jurgens 
Cc: Zhu Yanjun 
Cc: Tariq Toukan 
---
  drivers/net/ethernet/mellanox/mlx4/icm.c | 17 +++--
  1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index 685337d58276fc91baeeb64387c52985e1bc6dda..cae33d5c7dbd9ba7929adcf2127b104f6796fa5a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,13 @@
  #include "fw.h"
  
  /*

- * We allocate in page size (default 4KB on many archs) chunks to avoid high
- * order memory allocations in fragmented/high usage memory situation.
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk. Note that the chunks are not necessarily in contiguous
+ * physical memory.
   */
  enum {
-   MLX4_ICM_ALLOC_SIZE = PAGE_SIZE,
-   MLX4_TABLE_CHUNK_SIZE   = PAGE_SIZE,
+   MLX4_ICM_ALLOC_SIZE = 1 << 18,
+   MLX4_TABLE_CHUNK_SIZE   = 1 << 18,
  };
  
  static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)

@@ -135,6 +136,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
struct mlx4_icm *icm;
struct mlx4_icm_chunk *chunk = NULL;
int cur_order;
+   gfp_t mask;
int ret;
  
  	/* We use sg_set_buf for coherent allocs, which assumes low memory */

@@ -178,13 +180,16 @@ struct mlx4_icm *mlx4_alloc_icm(struct 
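
[The message is cut off above. As a sketch of the gfp_mask tweak the
changelog describes -- reconstructed from the description, not from the
missing hunk itself -- the idea is roughly:

	mask = gfp_mask;
	if (cur_order)
		mask &= ~__GFP_DIRECT_RECLAIM;	/* let high-order tries fail fast */

so only the final order-0 fallback is allowed to enter direct reclaim.]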

Re: [PATCH net-next 0/8] nfp: offload LAG for tc flower egress

2018-05-30 Thread Jiri Pirko
Wed, May 30, 2018 at 11:26:23AM CEST, john.hur...@netronome.com wrote:
>On Tue, May 29, 2018 at 11:09 PM, Jiri Pirko  wrote:
>> Tue, May 29, 2018 at 04:08:48PM CEST, john.hur...@netronome.com wrote:
>>>On Sat, May 26, 2018 at 3:47 AM, Jakub Kicinski
>>> wrote:
 On Fri, 25 May 2018 08:48:09 +0200, Jiri Pirko wrote:
> Thu, May 24, 2018 at 04:22:47AM CEST, jakub.kicin...@netronome.com wrote:
> >Hi!
> >
> >This series from John adds bond offload to the nfp driver.  Patch 5
> >exposes the hash type for NETDEV_LAG_TX_TYPE_HASH to make sure nfp
> >hashing matches that of the software LAG.  This may be unnecessarily
> >conservative, let's see what LAG maintainers think :)
>
> So you need to restrict offload to only certain hash algos? In mlxsw, we
> just ignore the lag setting and do some hw default hashing. Would that not
> be enough? Note that there's a good reason for it: as you can see, in team, the
> hashing is done in a BPF function and could be totally arbitrary.
> Your patchset effectively disables team offload for nfp.

 My understanding is that the project requirements only called for L3/L4
 hash algorithm offload, hence the temptation to err on the side of
 caution and not offload all the bond configurations.  John can provide
 more details.  Not being able to offload team is unfortunate indeed.
>>>
>>>Hi Jiri,
>>>Yes, as Jakub mentions, we restrict ourselves to L3/L4 hash algorithm
>>>as this is currently what is supported in fw.
>>
>> In mlxsw, a default l3/l4 is used always, no matter what the
>> bonding/team sets. It is not correct, but it works with team as well.
>> Perhaps we can have NETDEV_LAG_HASH_UNKNOWN to indicate to the driver to
>> do some default? That would make the "team" offload functional.
>>
>
>yes, I would agree with that.
>Thanks

Okay, would you please adjust your driver?

I will take care of the mlxsw bits.

Thanks!

>
>>>Hopefully this will change as fw features are expanded.
>>>I understand the issue this presents with offloading team.
>>>Perhaps resorting to a default hw hash for team is acceptable.
>>>John


Re: suspicious csum initialization in vmxnet3_rx_csum

2018-05-30 Thread Paolo Abeni
Hi,

On Thu, 2018-05-24 at 21:48 +, Guolin Yang wrote:
> Yes, that code is not correct; we should fix it

Did you have any chance to address the issue and/or to take a more
in-depth look at the change proposed in my initial email?

Thanks,

Paolo 


Re: [PATCH net 1/2] ip_tunnel: restore binding to ifaces with a large mtu

2018-05-30 Thread Ido Schimmel
On Wed, May 30, 2018 at 10:28:42AM +0200, Nicolas Dichtel wrote:
> After commit f6cc9c054e77, the following conf is broken (note that the
> default loopback mtu is 65536, ie IP_MAX_MTU + 1):
> 
> $ ip tunnel add gre1 mode gre local 10.125.0.1 remote 10.125.0.2 dev lo
> add tunnel "gre0" failed: Invalid argument
> $ ip l a type dummy
> $ ip l s dummy1 up
> $ ip l s dummy1 mtu 65535
> $ ip tunnel add gre1 mode gre local 10.125.0.1 remote 10.125.0.2 dev dummy1
> add tunnel "gre0" failed: Invalid argument
> 
> dev_set_mtu() doesn't allow setting an mtu which is too large.
> First, let's cap the mtu returned by ip_tunnel_bind_dev(). Second, remove
> the magic value 0xFFF8 and use IP_MAX_MTU instead.
> 0xFFF8 seems to have been there for ages; I don't know why this value was used.
> 
> With a recent kernel, it's also possible to set a mtu > IP_MAX_MTU:
> $ ip l s dummy1 mtu 66000
> After that patch, it's also possible to bind an ip tunnel on that kind of
> interface.
> 
> CC: Petr Machata 
> CC: Ido Schimmel 
> Link: https://git.kernel.org/pub/scm/linux/kernel/git/davem/netdev-vger-cvs.git/commit/?id=e5afd356a411a
> Fixes: f6cc9c054e77 ("ip_tunnel: Emit events for post-register MTU changes")
> Signed-off-by: Nicolas Dichtel 

Reviewed-by: Ido Schimmel 

There is another instance of this magic number in the file, but it's
written in lower case so you might have missed it - see
ip_tunnel_newlink(). Can you please take care of it in v2?

Thanks for the fix, Nicolas!
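
[For context, the lower-case instance Ido points at sits in
ip_tunnel_newlink(); a from-memory sketch, not a verified quote of the tree:

	if (tb[IFLA_MTU]) {
		unsigned int max = 0xfff8 - dev->hard_header_len - nt->hlen;

		mtu = clamp(nla_get_u32(tb[IFLA_MTU]),
			    (unsigned int)ETH_MIN_MTU, max);
	}

The requested v2 would replace that 0xfff8 with IP_MAX_MTU as well.]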


Re: [PATCH net-next v2 0/2] net: phy: improve PHY suspend/resume

2018-05-30 Thread Heiner Kallweit
On 24.05.2018 00:04, Andrew Lunn wrote:
> On Wed, May 23, 2018 at 10:15:29PM +0200, Heiner Kallweit wrote:
>> I have the issue that suspending the MAC-integrated PHY gives an
>> error during system suspend. The sequence is:
>>
>> 1. unconnected PHY/MAC are runtime-suspended already
>> 2. system suspend commences
>> 3. mdio_bus_phy_suspend is called
>> 4. suspend callback of the network driver is called (implicitly
>>MAC/PHY are runtime-resumed before)
>> 5. suspend callback suspends MAC/PHY
>>
>> The problem occurs in step 3. phy_suspend() fails because the MDIO
>> bus isn't accessible due to the chip being runtime-suspended.
> 
> I think you are fixing the wrong problem. I've had the same with the
> FEC driver. I fixed it by making the MDIO operations runtime-suspend
> aware:
> 
I checked the fec driver and there it's relatively easy because
runtime suspend/resume is just about disabling/enabling one clock.

In the case of r8169, runtime suspend/resume does much more, and there
would be quite a few potential deadlock issues to take care of.
To give one example:
pm_runtime_get_sync() would be called with the mdio bus lock held
(from the mdio write op); runtime-resuming, however, includes PHY
writes, which would result in a deadlock.
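
[A sketch of the inversion described above; an illustrative call chain only,
not actual driver code, with rtl_mdio_write() standing in for r8169's MDIO op:

	mdiobus_write()                     /* takes bus->mdio_lock */
	  -> rtl_mdio_write()
	       -> pm_runtime_get_sync()     /* chip suspended: kicks off resume */
	            -> runtime_resume()
	                 -> phy_write()     /* wants bus->mdio_lock again: deadlock */
]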

I think we need a better solution than spending the effort needed
to make the MDIO ops runtime-pm-aware. In general there seems to be
just one network driver using both phylib and runtime pm, so most
drivers aren't affected (yet).

I will spend a few more thoughts on a solution.

Regards, Heiner

> commit 8fff755e9f8d0f70a595e79f248695ce6aef5cc3
> Author: Andrew Lunn 
> Date:   Sat Jul 25 22:38:02 2015 +0200
> 
> net: fec: Ensure clocks are enabled while using mdio bus
> 
> When a switch is attached to the mdio bus, the mdio bus can be used
> while the interface is not open. If the IPG clock is not enabled, MDIO
> reads/writes will simply time out.
> 
> Add support for runtime PM to control this clock. Enable/disable this
> clock using runtime PM, with open()/close() and mdio read()/write()
> function triggering runtime PM operations. Since PM is optional, the
> IPG clock is enabled at probe and is no longer modified by
> fec_enet_clk_enable(), thus if PM is not enabled in the kernel, it is
> guaranteed the clock is running when MDIO operations are performed.
> 
> Don't copy this patch 1:1. I introduced a few bugs which took a while
> to be shaken out :-(
> 
>Andrew
> 
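
[The pattern Andrew describes boils down to wrapping the MDIO accessors in
runtime-PM gets/puts. A simplified sketch modelled on the fec commit quoted
above; the real function has more detail:

	static int fec_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
	{
		struct fec_enet_private *fep = bus->priv;
		struct device *dev = &fep->pdev->dev;
		int ret;

		ret = pm_runtime_get_sync(dev);	/* make sure the IPG clock is on */
		if (ret < 0)
			return ret;

		/* ... issue the MII read and wait for completion ... */

		pm_runtime_mark_last_busy(dev);
		pm_runtime_put_autosuspend(dev);
		return ret;
	}
]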



[PATCH net-next] net: phy: consider PHY_IGNORE_INTERRUPT in state machine PHY_NOLINK handling

2018-05-30 Thread Heiner Kallweit
We can bail out immediately in the case of PHY_IGNORE_INTERRUPT as well,
because phy_mac_interrupt() informs us once the link is up.

Signed-off-by: Heiner Kallweit 
---
 drivers/net/phy/phy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 05c1e8ef..537297d2 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -894,7 +894,7 @@ void phy_state_machine(struct work_struct *work)
needs_aneg = true;
break;
case PHY_NOLINK:
-   if (phy_interrupt_is_valid(phydev))
+   if (phydev->irq != PHY_POLL)
break;
 
err = phy_read_status(phydev);
-- 
2.17.1



Re: [PATCH bpf v3 3/5] selftests/bpf: test_sockmap, fix test timeout

2018-05-30 Thread John Fastabend
On 05/30/2018 12:29 PM, Alexei Starovoitov wrote:
> On Wed, May 30, 2018 at 02:56:09PM +0900, Prashant Bhole wrote:
>> In order to reduce the runtime of tests, the timeout for the select() call
>> was recently reduced from 1sec to 10usec. This was causing many test failures.
>> It was caught with failure handling commits in this series.
>>
>> Restoring the timeout from 10usec to 1sec
>>
>> Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
>> Signed-off-by: Prashant Bhole 
>> ---
>>  tools/testing/selftests/bpf/test_sockmap.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
>> index 64f9e25c451f..9d01f5c2abe2 100644
>> --- a/tools/testing/selftests/bpf/test_sockmap.c
>> +++ b/tools/testing/selftests/bpf/test_sockmap.c
>> @@ -345,8 +345,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
>>  if (err < 0)
>>  perror("recv start time: ");
>>  while (s->bytes_recvd < total_bytes) {
>> -timeout.tv_sec = 0;
>> -timeout.tv_usec = 10;
>> +timeout.tv_sec = 1;
>> +timeout.tv_usec = 0;
> 
> I've applied the set, but had to revert it, since it takes too long.
> 
> real  1m40.124s
> user  0m0.375s
> sys   0m14.521s
> 

Dang, I thought it would be a bit longer but not minutes.

> Myself and Daniel run the test semi-manually when we apply patches.
> Adding 2 extra minutes of wait time is unnecessary.

Yep.

> Especially since most of it is idle time.
> Please find a way to fix tests differently.
> btw I don't see any failures today. Not sure what is being fixed
> by increasing a timeout.
> 

Calling these fixes is a bit much; they are primarily improvements.

The background is, when I originally wrote the tests my goal was to
exercise the kernel code paths. Because of this I didn't really care if
the tests actually sent/recv all bytes in the test. (I have long
running tests using netperf/wrk/apached/etc. for that) But, the manual
tests do have an option to verify the data if specified. The 'verify'
option is a bit fragile in that with the right tests (e.g. drop)
or the certain options (e.g. cork) it can fail which is expected.

What Prashant added was support to actually verify the data correctly.
He also fixed some cgroup handling and some pretty printing as well.
He noticed the low timeout causing issues in these cases, though, so
he increased it.

@Prashant, how about increasing this less dramatically, because now
all cork tests are going to stall for 1s unless perfectly aligned.
How about 100us? Or even better, we can conditionally set it based
on whether tx_cork is set: if tx_cork is set, use 1us, otherwise 200us
or something. (1s is really too high in any case for lo)

Also capturing some of the above in the cover letter would help
folks understand the context a bit better.

Thanks!
John



Re: [PATCH bpf v3 3/5] selftests/bpf: test_sockmap, fix test timeout

2018-05-30 Thread Alexei Starovoitov
On Wed, May 30, 2018 at 02:56:09PM +0900, Prashant Bhole wrote:
> In order to reduce the runtime of tests, the timeout for the select() call
> was recently reduced from 1sec to 10usec. This was causing many test failures.
> It was caught with failure handling commits in this series.
> 
> Restoring the timeout from 10usec to 1sec
> 
> Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
> Signed-off-by: Prashant Bhole 
> ---
>  tools/testing/selftests/bpf/test_sockmap.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
> index 64f9e25c451f..9d01f5c2abe2 100644
> --- a/tools/testing/selftests/bpf/test_sockmap.c
> +++ b/tools/testing/selftests/bpf/test_sockmap.c
> @@ -345,8 +345,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
>   if (err < 0)
>   perror("recv start time: ");
>   while (s->bytes_recvd < total_bytes) {
> - timeout.tv_sec = 0;
> - timeout.tv_usec = 10;
> + timeout.tv_sec = 1;
> + timeout.tv_usec = 0;

I've applied the set, but had to revert it, since it takes too long.

real1m40.124s
user0m0.375s
sys 0m14.521s

Myself and Daniel run the test semi-manually when we apply patches.
Adding 2 extra minutes of wait time is unnecessary.
Especially since most of it is idle time.
Please find a way to fix tests differently.
btw I don't see any failures today. Not sure what is being fixed
by increasing a timeout.

Also please mention [PATCH bpf-next] in the subject when you respin.
Thanks!



[PATCH bpf-next] bpf: Change bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address families

2018-05-30 Thread dsahern
From: David Ahern 

Update bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address
families. Allows userspace to probe for support as more are added
(e.g., AF_MPLS).

Signed-off-by: David Ahern 
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 4cff6d9cd724..a2b96e44b2c1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4285,7 +4285,7 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
   flags, true);
 #endif
}
-   return 0;
+   return -EAFNOSUPPORT;
 }
 
 static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
@@ -4302,7 +4302,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
   struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
struct net *net = dev_net(skb->dev);
-   int index = 0;
+   int index = -EAFNOSUPPORT;
 
if (plen < sizeof(*params))
return -EINVAL;
-- 
2.11.0
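
[As a sketch of the probing this enables from a BPF program; AF_MPLS stands
in for a family the running kernel does not support:

	struct bpf_fib_lookup params = {};
	long rc;

	params.family  = AF_MPLS;
	params.ifindex = ctx->ingress_ifindex;

	rc = bpf_fib_lookup(ctx, &params, sizeof(params), 0);
	if (rc == -EAFNOSUPPORT) {
		/* this kernel cannot do MPLS FIB lookups from BPF yet */
	}
]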



[PATCH] iproute2: fix 'ip xfrm monitor all' command

2018-05-30 Thread Nathan Harold
Currently, calling 'ip xfrm monitor all' will
actually invoke the 'all-nsid' command because the
soft-match for 'all-nsid' occurs before the precise
match for 'all'. This patch rearranges the checks
so that the 'all' command, itself an alias for
invoking 'ip xfrm monitor' with no argument, can
be called consistently with the syntax for other ip
commands that accept an 'all'.

Signed-off-by: Nathan Harold 
---
 ip/xfrm_monitor.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ip/xfrm_monitor.c b/ip/xfrm_monitor.c
index 2eabece0..5d086768 100644
--- a/ip/xfrm_monitor.c
+++ b/ip/xfrm_monitor.c
@@ -359,6 +359,8 @@ int do_xfrm_monitor(int argc, char **argv)
if (matches(*argv, "file") == 0) {
NEXT_ARG();
file = *argv;
+   } else if (strcmp(*argv, "all") == 0) {
+   /* fall out */
} else if (matches(*argv, "all-nsid") == 0) {
listen_all_nsid = 1;
} else if (matches(*argv, "acquire") == 0) {
@@ -381,7 +383,7 @@ int do_xfrm_monitor(int argc, char **argv)
groups = 0;
} else if (matches(*argv, "help") == 0) {
usage();
-   } else if (strcmp(*argv, "all")) {
+   } else {
fprintf(stderr, "Argument \"%s\" is unknown, try \"ip 
xfrm monitor help\".\n", *argv);
exit(-1);
}
-- 
2.17.1.1185.g55be947832-goog



[PATCH V2 mlx5-next 1/2] net/mlx5: Add temperature warning event to log

2018-05-30 Thread Saeed Mahameed
From: Ilan Tayari 

A temperature warning event is sent by the FW to indicate high temperature,
as detected by one of the sensors on the board.
Add handling of this event by writing the numbers of the alerting sensors
to the kernel log.

Signed-off-by: Ilan Tayari 
Signed-off-by: Adi Nissim 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 23 
 include/linux/mlx5/device.h  |  7 ++
 include/linux/mlx5/mlx5_ifc.h|  2 +-
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index c1c94974e16b..4bd4f011f0a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -141,6 +141,8 @@ static const char *eqe_type_str(u8 type)
return "MLX5_EVENT_TYPE_GPIO_EVENT";
case MLX5_EVENT_TYPE_PORT_MODULE_EVENT:
return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT";
+   case MLX5_EVENT_TYPE_TEMP_WARN_EVENT:
+   return "MLX5_EVENT_TYPE_TEMP_WARN_EVENT";
case MLX5_EVENT_TYPE_REMOTE_CONFIG:
return "MLX5_EVENT_TYPE_REMOTE_CONFIG";
case MLX5_EVENT_TYPE_DB_BF_CONGESTION:
@@ -393,6 +395,20 @@ static void general_event_handler(struct mlx5_core_dev *dev,
}
 }
 
+static void mlx5_temp_warning_event(struct mlx5_core_dev *dev,
+   struct mlx5_eqe *eqe)
+{
+   u64 value_lsb;
+   u64 value_msb;
+
+   value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb);
+   value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb);
+
+   mlx5_core_warn(dev,
+  "High temperature on sensors with bit set %llx %llx",
+  value_msb, value_lsb);
+}
+
 /* caller must eventually call mlx5_cq_put on the returned cq */
 static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn)
 {
@@ -547,6 +563,10 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
mlx5_fpga_event(dev, eqe->type, &eqe->data.raw);
break;
 
+   case MLX5_EVENT_TYPE_TEMP_WARN_EVENT:
+   mlx5_temp_warning_event(dev, eqe);
+   break;
+
case MLX5_EVENT_TYPE_GENERAL_EVENT:
general_event_handler(dev, eqe);
break;
@@ -824,6 +844,9 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED);
 
 
+   if (MLX5_CAP_GEN(dev, temp_warn_event))
+   async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT);
+
err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
 "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 2bc27f8c5b87..eddacee5cf61 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -314,6 +314,7 @@ enum mlx5_event {
MLX5_EVENT_TYPE_PORT_CHANGE= 0x09,
MLX5_EVENT_TYPE_GPIO_EVENT = 0x15,
MLX5_EVENT_TYPE_PORT_MODULE_EVENT  = 0x16,
+   MLX5_EVENT_TYPE_TEMP_WARN_EVENT= 0x17,
MLX5_EVENT_TYPE_REMOTE_CONFIG  = 0x19,
MLX5_EVENT_TYPE_GENERAL_EVENT  = 0x22,
MLX5_EVENT_TYPE_PPS_EVENT  = 0x25,
@@ -626,6 +627,11 @@ struct mlx5_eqe_dct {
__be32  dctn;
 };
 
+struct mlx5_eqe_temp_warning {
+   __be64 sensor_warning_msb;
+   __be64 sensor_warning_lsb;
+} __packed;
+
 union ev_data {
__be32  raw[7];
struct mlx5_eqe_cmd cmd;
@@ -642,6 +648,7 @@ union ev_data {
struct mlx5_eqe_port_module port_module;
struct mlx5_eqe_pps pps;
struct mlx5_eqe_dct dct;
+   struct mlx5_eqe_temp_warning temp_warning;
 } __packed;
 
 struct mlx5_eqe {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 10c1613d9434..ba30c26aa6eb 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -926,7 +926,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 log_max_msg[0x5];
u8 reserved_at_1c8[0x4];
u8 max_tc[0x4];
-   u8 reserved_at_1d0[0x1];
+   u8 temp_warn_event[0x1];
u8 dcbx[0x1];
u8 general_notification_event[0x1];
u8 reserved_at_1d3[0x2];
-- 
2.17.0



[PATCH V2 mlx5-next 2/2] net/mlx5: Add FPGA QP error event

2018-05-30 Thread Saeed Mahameed
From: Ilan Tayari 

The FPGA queue pair (QP) event fires whenever a QP on the FPGA
transitions to the error state.

At this stage, this event is unrecoverable; it may become recoverable
in the future.

Signed-off-by: Ilan Tayari 
Signed-off-by: Adi Nissim 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c |  7 +--
 include/linux/mlx5/device.h  |  1 +
 include/linux/mlx5/mlx5_ifc.h|  1 +
 include/linux/mlx5/mlx5_ifc_fpga.h   | 16 
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 4bd4f011f0a9..77c685645c66 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -161,6 +161,8 @@ static const char *eqe_type_str(u8 type)
return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE";
case MLX5_EVENT_TYPE_FPGA_ERROR:
return "MLX5_EVENT_TYPE_FPGA_ERROR";
+   case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
+   return "MLX5_EVENT_TYPE_FPGA_QP_ERROR";
case MLX5_EVENT_TYPE_GENERAL_EVENT:
return "MLX5_EVENT_TYPE_GENERAL_EVENT";
default:
@@ -560,6 +562,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
break;
 
case MLX5_EVENT_TYPE_FPGA_ERROR:
+   case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
mlx5_fpga_event(dev, eqe->type, &eqe->data.raw);
break;
 
@@ -839,11 +842,11 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
async_event_mask |= (1ull << MLX5_EVENT_TYPE_PPS_EVENT);
 
if (MLX5_CAP_GEN(dev, fpga))
-   async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR);
+   async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR) |
+   (1ull << MLX5_EVENT_TYPE_FPGA_QP_ERROR);
if (MLX5_CAP_GEN_MAX(dev, dct))
async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED);
 
-
if (MLX5_CAP_GEN(dev, temp_warn_event))
async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT);
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index eddacee5cf61..71e1dc2523a6 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -331,6 +331,7 @@ enum mlx5_event {
MLX5_EVENT_TYPE_DCT_DRAINED= 0x1c,
 
MLX5_EVENT_TYPE_FPGA_ERROR = 0x20,
+   MLX5_EVENT_TYPE_FPGA_QP_ERROR  = 0x21,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ba30c26aa6eb..3e8845dc85fe 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -60,6 +60,7 @@ enum {
MLX5_EVENT_TYPE_CODING_COMMAND_INTERFACE_COMPLETION= 0xa,
MLX5_EVENT_TYPE_CODING_PAGE_REQUEST= 0xb,
MLX5_EVENT_TYPE_CODING_FPGA_ERROR  = 0x20,
+   MLX5_EVENT_TYPE_CODING_FPGA_QP_ERROR   = 0x21
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h
index ec052491ba3d..7ddca31fa05d 100644
--- a/include/linux/mlx5/mlx5_ifc_fpga.h
+++ b/include/linux/mlx5/mlx5_ifc_fpga.h
@@ -432,6 +432,22 @@ struct mlx5_ifc_ipsec_counters_bits {
u8 dropped_cmd[0x40];
 };
 
+enum {
+   MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RETRY_COUNTER_EXPIRED  = 0x1,
+   MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RNR_EXPIRED= 0x2,
+};
+
+struct mlx5_ifc_fpga_qp_error_event_bits {
+   u8 reserved_at_0[0x40];
+
+   u8 reserved_at_40[0x18];
+   u8 syndrome[0x8];
+
+   u8 reserved_at_60[0x60];
+
+   u8 reserved_at_c0[0x8];
+   u8 fpga_qpn[0x18];
+};
 enum mlx5_ifc_fpga_ipsec_response_syndrome {
MLX5_FPGA_IPSEC_RESPONSE_SUCCESS = 0,
MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST = 1,
-- 
2.17.0



[PATCH V2 mlx5-next 0/2] Mellanox, mlx5 new device events

2018-05-30 Thread Saeed Mahameed
Hi, 

The following series is for the mlx5-next tree [1]; it adds support for two
new device events, from Ilan Tayari:

1. High temperature warnings.
2. FPGA QP error event.

In case of no objection this series will be applied to mlx5-next tree
and will be sent later as a pull request to both rdma and net trees.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git/log/?h=mlx5-next

v1->v2:
  - improve commit message of the FPGA QP error event patch.

Thanks,
Saeed.

Ilan Tayari (2):
  net/mlx5: Add temperature warning event to log
  net/mlx5: Add FPGA QP error event

 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 28 +++-
 include/linux/mlx5/device.h  |  8 ++
 include/linux/mlx5/mlx5_ifc.h|  3 ++-
 include/linux/mlx5/mlx5_ifc_fpga.h   | 16 +++
 4 files changed, 53 insertions(+), 2 deletions(-)

-- 
2.17.0



[PATCH v2] ixgbe: check ipsec ip addr against mgmt filters

2018-05-30 Thread Shannon Nelson
Make sure we don't try to offload the decryption of an incoming
packet that should get delivered to the management engine.  This
is a corner case that will likely be very seldom seen, but could
really confuse someone if they were to hit it.

Suggested-by: Jesse Brandeburg 
Signed-off-by: Shannon Nelson 
---
v2 - added the BMC IP check

 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 88 ++
 1 file changed, 88 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 99b170f..e1c9762 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -445,6 +445,89 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state *xs,
 }
 
 /**
+ * ixgbe_ipsec_check_mgmt_ip - make sure there is no clash with mgmt IP filters
+ * @xs: pointer to transformer state struct
+ **/
+static int ixgbe_ipsec_check_mgmt_ip(struct xfrm_state *xs)
+{
+   struct net_device *dev = xs->xso.dev;
+   struct ixgbe_adapter *adapter = netdev_priv(dev);
+   struct ixgbe_hw *hw = &adapter->hw;
+   u32 mfval, manc, reg;
+   int num_filters = 4;
+   bool manc_ipv4;
+   u32 bmcipval;
+   int i, j;
+
+#define MANC_EN_IPV4_FILTER  BIT(24)
+#define MFVAL_IPV4_FILTER_SHIFT  16
+#define MFVAL_IPV6_FILTER_SHIFT  24
+#define MIPAF_ARR(_m, _n)    (IXGBE_MIPAF + ((_m) * 0x10) + ((_n) * 4))
+
+#define IXGBE_BMCIP(_n)  (0x5050 + ((_n) * 4))
+#define IXGBE_BMCIPVAL   0x5060
+#define BMCIP_V4 0x2
+#define BMCIP_V6 0x3
+#define BMCIP_MASK   0x3
+
+   manc = IXGBE_READ_REG(hw, IXGBE_MANC);
+   manc_ipv4 = !!(manc & MANC_EN_IPV4_FILTER);
+   mfval = IXGBE_READ_REG(hw, IXGBE_MFVAL);
+   bmcipval = IXGBE_READ_REG(hw, IXGBE_BMCIPVAL);
+
+   if (xs->props.family == AF_INET) {
+   /* are there any IPv4 filters to check? */
+   if (manc_ipv4) {
+   /* the 4 ipv4 filters are all in MIPAF(3, i) */
+   for (i = 0; i < num_filters; i++) {
+   if (!(mfval & BIT(MFVAL_IPV4_FILTER_SHIFT + i)))
+   continue;
+
+   reg = IXGBE_READ_REG(hw, MIPAF_ARR(3, i));
+   if (reg == xs->id.daddr.a4)
+   return 1;
+   }
+   }
+
+   if ((bmcipval & BMCIP_MASK) == BMCIP_V4) {
+   reg = IXGBE_READ_REG(hw, IXGBE_BMCIP(3));
+   if (reg == xs->id.daddr.a4)
+   return 1;
+   }
+
+   } else {
+   /* if there are ipv4 filters, they are in the last ipv6 slot */
+   if (manc_ipv4)
+   num_filters = 3;
+
+   for (i = 0; i < num_filters; i++) {
+   if (!(mfval & BIT(MFVAL_IPV6_FILTER_SHIFT + i)))
+   continue;
+
+   for (j = 0; j < 4; j++) {
+   reg = IXGBE_READ_REG(hw, MIPAF_ARR(i, j));
+   if (reg != xs->id.daddr.a6[j])
+   break;
+   }
+   if (j == 4)   /* did we match all 4 words? */
+   return 1;
+   }
+
+   if ((bmcipval & BMCIP_MASK) == BMCIP_V6) {
+   for (j = 0; j < 4; j++) {
+   reg = IXGBE_READ_REG(hw, IXGBE_BMCIP(j));
+   if (reg != xs->id.daddr.a6[j])
+   break;
+   }
+   if (j == 4)   /* did we match all 4 words? */
+   return 1;
+   }
+   }
+
+   return 0;
+}
+
+/**
  * ixgbe_ipsec_add_sa - program device with a security association
  * @xs: pointer to transformer state struct
  **/
@@ -465,6 +548,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
return -EINVAL;
}
 
+   if (ixgbe_ipsec_check_mgmt_ip(xs)) {
+   netdev_err(dev, "IPsec IP addr clash with mgmt filters\n");
+   return -EINVAL;
+   }
+
if (xs->xso.flags & XFRM_OFFLOAD_INBOUND) {
struct rx_sa rsa;
 
-- 
2.7.4



[bpf-next V1 PATCH 7/8] bpf/xdp: non-map redirect can avoid calling ndo_xdp_flush

2018-05-30 Thread Jesper Dangaard Brouer
This is the first real user of the XDP_XMIT_FLUSH flag.

As pointed out many times, XDP_REDIRECT without using BPF maps is
significantly slower than the map variant.  This is primarily due to the
lack of bulking, as the ndo_xdp_flush operation is required after each
frame (to avoid frames hanging on the egress device).

It is still possible to optimize this case.  Instead of invoking two
NDO indirect calls, which are very expensive with CONFIG_RETPOLINE,
instruct ndo_xdp_xmit to flush via the XDP_XMIT_FLUSH flag.

Signed-off-by: Jesper Dangaard Brouer 
---
 net/core/filter.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 6a21dbcad350..6981b4608979 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3056,10 +3056,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
if (unlikely(!xdpf))
return -EOVERFLOW;
 
-   sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0);
+   sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
if (sent <= 0)
return sent;
-   dev->netdev_ops->ndo_xdp_flush(dev);
return 0;
 }
 



[bpf-next V1 PATCH 5/8] virtio_net: implement flush flag for ndo_xdp_xmit

2018-05-30 Thread Jesper Dangaard Brouer
Signed-off-by: Jesper Dangaard Brouer 
---
 drivers/net/virtio_net.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4ed823625953..62ba8aadd8e6 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -481,7 +481,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
int err;
int i;
 
-   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+   if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
return -EINVAL;
 
qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
@@ -507,6 +507,10 @@ static int virtnet_xdp_xmit(struct net_device *dev,
drops++;
}
}
+
+   if (flags & XDP_XMIT_FLUSH)
+   virtqueue_kick(sq->vq);
+
return n - drops;
 }
 



[bpf-next V1 PATCH 6/8] xdp: done implementing ndo_xdp_xmit flush flag for all drivers

2018-05-30 Thread Jesper Dangaard Brouer
Removing XDP_XMIT_FLAGS_NONE as all drivers now implement
a flush operation in their ndo_xdp_xmit call.  The compiler
will catch if any users of XDP_XMIT_FLAGS_NONE remains.

Signed-off-by: Jesper Dangaard Brouer 
---
 include/net/xdp.h |1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 308a4b30b484..0bc304b80cdf 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -41,7 +41,6 @@ enum xdp_mem_type {
 };
 
 /* XDP flags for ndo_xdp_xmit */
-#define XDP_XMIT_FLAGS_NONE0U
 #define XDP_XMIT_FLUSH (1U << 0)
 #define XDP_XMIT_FLAGS_MASKXDP_XMIT_FLUSH
 



[bpf-next V1 PATCH 8/8] bpf/xdp: devmap can avoid calling ndo_xdp_flush

2018-05-30 Thread Jesper Dangaard Brouer
The XDP_REDIRECT devmap can avoid using ndo_xdp_flush by instead
instructing ndo_xdp_xmit to flush via the XDP_XMIT_FLUSH flag in
appropriate places.

Notice after this patch it is possible to remove ndo_xdp_flush
completely, as this is the last user of ndo_xdp_flush. This is left
for later patches, to keep driver changes separate.

Signed-off-by: Jesper Dangaard Brouer 
---
 kernel/bpf/devmap.c |   20 +++-
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 04fbd75a5274..9c846a7a8cff 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -217,7 +217,7 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 }
 
 static int bq_xmit_all(struct bpf_dtab_netdev *obj,
-struct xdp_bulk_queue *bq)
+  struct xdp_bulk_queue *bq, bool flush)
 {
struct net_device *dev = obj->dev;
int sent = 0, drops = 0, err = 0;
@@ -232,7 +232,8 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
prefetch(xdpf);
}
 
-   sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0);
+   sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q,
+flush ? XDP_XMIT_FLUSH : 0);
if (sent < 0) {
err = sent;
sent = 0;
@@ -276,7 +277,6 @@ void __dev_map_flush(struct bpf_map *map)
for_each_set_bit(bit, bitmap, map->max_entries) {
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
struct xdp_bulk_queue *bq;
-   struct net_device *netdev;
 
/* This is possible if the dev entry is removed by user space
 * between xdp redirect and flush op.
@@ -287,10 +287,7 @@ void __dev_map_flush(struct bpf_map *map)
__clear_bit(bit, bitmap);
 
bq = this_cpu_ptr(dev->bulkq);
-   bq_xmit_all(dev, bq);
-   netdev = dev->dev;
-   if (likely(netdev->netdev_ops->ndo_xdp_flush))
-   netdev->netdev_ops->ndo_xdp_flush(netdev);
+   bq_xmit_all(dev, bq, true);
}
 }
 
@@ -320,7 +317,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
-   bq_xmit_all(obj, bq);
+   bq_xmit_all(obj, bq, false);
 
/* Ingress dev_rx will be the same for all xdp_frame's in
 * bulk_queue, because bq stored per-CPU and must be flushed
@@ -359,8 +356,7 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 
 static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
-   if (dev->dev->netdev_ops->ndo_xdp_flush) {
-   struct net_device *fl = dev->dev;
+   if (dev->dev->netdev_ops->ndo_xdp_xmit) {
struct xdp_bulk_queue *bq;
unsigned long *bitmap;
 
@@ -371,9 +367,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
__clear_bit(dev->bit, bitmap);
 
bq = per_cpu_ptr(dev->bulkq, cpu);
-   bq_xmit_all(dev, bq);
-
-   fl->netdev_ops->ndo_xdp_flush(dev->dev);
+   bq_xmit_all(dev, bq, true);
}
}
 }



[bpf-next V1 PATCH 0/8] bpf/xdp: add flags argument to ndo_xdp_xmit and flag flush operation

2018-05-30 Thread Jesper Dangaard Brouer
As I mentioned in merge commit 10f678683e4 ("Merge branch 'xdp_xmit-bulking'")
I plan to change the API for ndo_xdp_xmit once more, by adding a flags
argument, which is done in this patchset.

I know it is late in the cycle (currently at rc7), but it would be
nice to avoid changing NDOs over several kernel releases, as it is
annoying to vendors and distro backporters, but it is not strictly
UAPI so it is allowed (according to Alexei).

The end-goal is getting rid of the ndo_xdp_flush operation, as it will
make it possible for drivers to implement a TXQ synchronization mechanism
that is not necessarily derived from the CPU id (smp_processor_id).

This patchset removes all callers of the ndo_xdp_flush operation, but
it doesn't take the last step of removing it from all drivers.  This
can be done later, or I can update the patchset on request.

Micro-benchmarks only show a very small performance improvement, for
map-redirect around ~2 ns, and for non-map redirect ~7 ns.  I've not
benchmarked this with CONFIG_RETPOLINE, but the performance benefit
should be more visible given we end up removing an indirect call.

---

Jesper Dangaard Brouer (8):
  xdp: add flags argument to ndo_xdp_xmit API
  i40e: implement flush flag for ndo_xdp_xmit
  ixgbe: implement flush flag for ndo_xdp_xmit
  tun: implement flush flag for ndo_xdp_xmit
  virtio_net: implement flush flag for ndo_xdp_xmit
  xdp: done implementing ndo_xdp_xmit flush flag for all drivers
  bpf/xdp: non-map redirect can avoid calling ndo_xdp_flush
  bpf/xdp: devmap can avoid calling ndo_xdp_flush


 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |9 -
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   23 +--
 drivers/net/tun.c |   25 ++---
 drivers/net/virtio_net.c  |9 -
 include/linux/netdevice.h |7 ---
 include/net/xdp.h |4 
 kernel/bpf/devmap.c   |   20 +++-
 net/core/filter.c |3 +--
 9 files changed, 69 insertions(+), 34 deletions(-)

--


[bpf-next V1 PATCH 3/8] ixgbe: implement flush flag for ndo_xdp_xmit

2018-05-30 Thread Jesper Dangaard Brouer
Signed-off-by: Jesper Dangaard Brouer 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 87f088f4af52..4fd77c9067f2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10022,6 +10022,15 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
}
 }
 
+static void ixgbe_xdp_ring_update_tail(struct ixgbe_ring *ring)
+{
+   /* Force memory writes to complete before letting h/w know there
+* are new descriptors to fetch.
+*/
+   wmb();
+   writel(ring->next_to_use, ring->tail);
+}
+
 static int ixgbe_xdp_xmit(struct net_device *dev, int n,
  struct xdp_frame **frames, u32 flags)
 {
@@ -10033,7 +10042,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
return -ENETDOWN;
 
-   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+   if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
return -EINVAL;
 
/* During program transitions its possible adapter->xdp_prog is assigned
@@ -10054,6 +10063,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
}
}
 
+   if (unlikely(flags & XDP_XMIT_FLUSH))
+   ixgbe_xdp_ring_update_tail(ring);
+
return n - drops;
 }
 
@@ -10072,11 +10084,7 @@ static void ixgbe_xdp_flush(struct net_device *dev)
if (unlikely(!ring))
return;
 
-   /* Force memory writes to complete before letting h/w know there
-* are new descriptors to fetch.
-*/
-   wmb();
-   writel(ring->next_to_use, ring->tail);
+   ixgbe_xdp_ring_update_tail(ring);
 
return;
 }



[bpf-next V1 PATCH 4/8] tun: implement flush flag for ndo_xdp_xmit

2018-05-30 Thread Jesper Dangaard Brouer
Signed-off-by: Jesper Dangaard Brouer 
---
 drivers/net/tun.c |   19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b182b8cdd219..d82a05fb0594 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1285,6 +1285,14 @@ static const struct net_device_ops tun_netdev_ops = {
.ndo_get_stats64= tun_net_get_stats64,
 };
 
+static void __tun_xdp_flush_tfile(struct tun_file *tfile)
+{
+   /* Notify and wake up reader process */
+   if (tfile->flags & TUN_FASYNC)
+   kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
+   tfile->socket.sk->sk_data_ready(tfile->socket.sk);
+}
+
 static int tun_xdp_xmit(struct net_device *dev, int n,
struct xdp_frame **frames, u32 flags)
 {
@@ -1295,7 +1303,7 @@ static int tun_xdp_xmit(struct net_device *dev, int n,
int cnt = n;
int i;
 
-   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+   if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
return -EINVAL;
 
rcu_read_lock();
@@ -1325,6 +1333,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n,
}
spin_unlock(&tfile->tx_ring.producer_lock);
 
+   if (flags & XDP_XMIT_FLUSH)
+   __tun_xdp_flush_tfile(tfile);
+
rcu_read_unlock();
return cnt - drops;
 }
@@ -1353,11 +1364,7 @@ static void tun_xdp_flush(struct net_device *dev)
 
tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
numqueues]);
-   /* Notify and wake up reader process */
-   if (tfile->flags & TUN_FASYNC)
-   kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
-   tfile->socket.sk->sk_data_ready(tfile->socket.sk);
-
+   __tun_xdp_flush_tfile(tfile);
 out:
rcu_read_unlock();
 }



[bpf-next V1 PATCH 2/8] i40e: implement flush flag for ndo_xdp_xmit

2018-05-30 Thread Jesper Dangaard Brouer
Signed-off-by: Jesper Dangaard Brouer 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index c0451d6e0790..03c1446f0465 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3685,7 +3685,7 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
return -ENXIO;
 
-   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+   if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
return -EINVAL;
 
for (i = 0; i < n; i++) {
@@ -3699,6 +3699,9 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
}
}
 
+   if (unlikely(flags & XDP_XMIT_FLUSH))
+   i40e_xdp_ring_update_tail(vsi->xdp_rings[queue_index]);
+
return n - drops;
 }
 



[bpf-next V1 PATCH 1/8] xdp: add flags argument to ndo_xdp_xmit API

2018-05-30 Thread Jesper Dangaard Brouer
This patch only changes the API and rejects any use of flags. This is an
intermediate step that allows us to implement the flush flag operation
later, for each individual driver in a separate patch.

The plan is to implement flush operation via XDP_XMIT_FLUSH flag
and then remove XDP_XMIT_FLAGS_NONE when done.

Signed-off-by: Jesper Dangaard Brouer 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |6 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |5 -
 drivers/net/tun.c |8 ++--
 drivers/net/virtio_net.c  |5 -
 include/linux/netdevice.h |7 ---
 include/net/xdp.h |5 +
 kernel/bpf/devmap.c   |2 +-
 net/core/filter.c |2 +-
 9 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 9b698c5acd05..c0451d6e0790 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3670,7 +3670,8 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  * For error cases, a negative errno code is returned and no-frames
  * are transmitted (caller must handle freeing frames).
  **/
-int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+ u32 flags)
 {
struct i40e_netdev_priv *np = netdev_priv(dev);
unsigned int queue_index = smp_processor_id();
@@ -3684,6 +3685,9 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
return -ENXIO;
 
+   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+   return -EINVAL;
+
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
int err;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index eb8804b3d7b6..820f76db251b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -487,7 +487,8 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+ u32 flags);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 031d65c4178d..87f088f4af52 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10023,7 +10023,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 }
 
 static int ixgbe_xdp_xmit(struct net_device *dev, int n,
- struct xdp_frame **frames)
+ struct xdp_frame **frames, u32 flags)
 {
struct ixgbe_adapter *adapter = netdev_priv(dev);
struct ixgbe_ring *ring;
@@ -10033,6 +10033,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
return -ENETDOWN;
 
+   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+   return -EINVAL;
+
/* During program transitions its possible adapter->xdp_prog is assigned
 * but ring has not been configured yet. In this case simply abort xmit.
 */
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2265d2ccea47..b182b8cdd219 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1285,7 +1285,8 @@ static const struct net_device_ops tun_netdev_ops = {
.ndo_get_stats64= tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
+static int tun_xdp_xmit(struct net_device *dev, int n,
+   struct xdp_frame **frames, u32 flags)
 {
struct tun_struct *tun = netdev_priv(dev);
struct tun_file *tfile;
@@ -1294,6 +1295,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames
int cnt = n;
int i;
 
+   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+   return -EINVAL;
+
rcu_read_lock();
 
numqueues = READ_ONCE(tun->numqueues);
@@ -1332,7 +1336,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
if (unlikely(!frame))
return -EOVERFLOW;
 
-   return tun_xdp_xmit(dev, 1, &frame);
+   return tun_xdp_xmit(dev, 1, &frame, 0);

[net] ixgbe: fix parsing of TC actions for HW offload

2018-05-30 Thread Jeff Kirsher
From: Ondřej Hlavatý 

The previous code was optimistic, accepting the offload of the whole action
chain when there was a single known action (drop/redirect). This results
in offloading a rule which should not be offloaded, because its behavior
cannot be reproduced in the hardware.

For example:

$ tc filter add dev eno1 parent ffff: protocol ip \
u32 ht 800: order 1 match tcp src 42 ffff \
action mirred egress mirror dev enp1s16 pipe \
drop

The controller is unable to mirror the packet to a VF, but still
offloads the rule by dropping the packet.

Change the approach of the function to a pessimistic one, rejecting the
chain when an unknown action is found. This is better suited for future
extensions.

Note that both recognized actions always return TC_ACT_SHOT, therefore
it is safe to ignore actions behind them.

Cc: Jamal Hadi Salim 
Cc: Jiri Pirko 
Signed-off-by: Ondřej Hlavatý 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index afadba99f7b8..d01e1f0280cf 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9054,7 +9054,6 @@ static int parse_tc_actions(struct ixgbe_adapter *adapter,
 {
const struct tc_action *a;
LIST_HEAD(actions);
-   int err;
 
if (!tcf_exts_has_actions(exts))
return -EINVAL;
@@ -9075,14 +9074,14 @@ static int parse_tc_actions(struct ixgbe_adapter *adapter,
 
if (!dev)
return -EINVAL;
-   err = handle_redirect_action(adapter, dev->ifindex, queue,
-action);
-   if (err == 0)
-   return err;
+   return handle_redirect_action(adapter, dev->ifindex,
+ queue, action);
}
+
+   return -EINVAL;
}
 
-   return -EINVAL;
+   return 0;
 }
 #else
 static int parse_tc_actions(struct ixgbe_adapter *adapter,
-- 
2.17.0
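
[Stitched together from the hunks above, the resulting control flow of
parse_tc_actions() is roughly the following sketch, with the mirred device
lookup elided:

	list_for_each_entry(a, &actions, list) {
		if (is_tcf_gact_shot(a)) {		/* drop */
			*action = IXGBE_FDIR_DROP_QUEUE;
			return 0;
		}
		if (is_tcf_mirred_egress_redirect(a)) {
			/* ... resolve the target dev, then: */
			return handle_redirect_action(adapter, dev->ifindex,
						      queue, action);
		}
		return -EINVAL;	/* any unknown action rejects the chain */
	}

	return 0;	/* nothing to offload */
]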



Re: [PATCH] b53: Add brcm5389 support

2018-05-30 Thread Florian Fainelli



On 05/30/2018 08:33 AM, Damien Thébault wrote:
> This patch adds support for the BCM5389 switch connected through MDIO.

This looks good, please do address the following couple of things:
- subject should be: net: dsa: b53: Add BCM5389 support
- you also need to update
Documentation/devicetree/bindings/net/dsa/b53.txt with the compatible string

Thank you!

> 
> Signed-off-by: Damien Thébault 
> ---
>  drivers/net/dsa/b53/b53_common.c | 13 +
>  drivers/net/dsa/b53/b53_mdio.c   |  5 -
>  drivers/net/dsa/b53/b53_priv.h   |  1 +
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
> index 78616787f2a3..3da5fca77cbd 100644
> --- a/drivers/net/dsa/b53/b53_common.c
> +++ b/drivers/net/dsa/b53/b53_common.c
> @@ -1711,6 +1711,18 @@ static const struct b53_chip_data b53_switch_chips[] = {
>   .cpu_port = B53_CPU_PORT_25,
>   .duplex_reg = B53_DUPLEX_STAT_FE,
>   },
> + {
> + .chip_id = BCM5389_DEVICE_ID,
> + .dev_name = "BCM5389",
> + .vlans = 4096,
> + .enabled_ports = 0x1f,
> + .arl_entries = 4,
> + .cpu_port = B53_CPU_PORT,
> + .vta_regs = B53_VTA_REGS,
> + .duplex_reg = B53_DUPLEX_STAT_GE,
> + .jumbo_pm_reg = B53_JUMBO_PORT_MASK,
> + .jumbo_size_reg = B53_JUMBO_MAX_SIZE,
> + },
>   {
>   .chip_id = BCM5395_DEVICE_ID,
>   .dev_name = "BCM5395",
> @@ -2034,6 +2046,7 @@ int b53_switch_detect(struct b53_device *dev)
>   else
>   dev->chip_id = BCM5365_DEVICE_ID;
>   break;
> + case BCM5389_DEVICE_ID:
>   case BCM5395_DEVICE_ID:
>   case BCM5397_DEVICE_ID:
>   case BCM5398_DEVICE_ID:
> diff --git a/drivers/net/dsa/b53/b53_mdio.c b/drivers/net/dsa/b53/b53_mdio.c
> index fa7556f5d4fb..a533a90e3904 100644
> --- a/drivers/net/dsa/b53/b53_mdio.c
> +++ b/drivers/net/dsa/b53/b53_mdio.c
> @@ -285,6 +285,7 @@ static const struct b53_io_ops b53_mdio_ops = {
>  #define B53_BRCM_OUI_1   0x0143bc00
>  #define B53_BRCM_OUI_2   0x03625c00
>  #define B53_BRCM_OUI_3   0x00406000
> +#define B53_BRCM_OUI_4   0x01410c00
>  
>  static int b53_mdio_probe(struct mdio_device *mdiodev)
>  {
> @@ -311,7 +312,8 @@ static int b53_mdio_probe(struct mdio_device *mdiodev)
>*/
>   if ((phy_id & 0xfc00) != B53_BRCM_OUI_1 &&
>   (phy_id & 0xfc00) != B53_BRCM_OUI_2 &&
> - (phy_id & 0xfc00) != B53_BRCM_OUI_3) {
> + (phy_id & 0xfc00) != B53_BRCM_OUI_3 &&
> + (phy_id & 0xfc00) != B53_BRCM_OUI_4) {
>   dev_err(&mdiodev->dev, "Unsupported device: 0x%08x\n", phy_id);
>   return -ENODEV;
>   }
> @@ -360,6 +362,7 @@ static const struct of_device_id b53_of_match[] = {
>   { .compatible = "brcm,bcm53125" },
>   { .compatible = "brcm,bcm53128" },
>   { .compatible = "brcm,bcm5365" },
> + { .compatible = "brcm,bcm5389" },
>   { .compatible = "brcm,bcm5395" },
>   { .compatible = "brcm,bcm5397" },
>   { .compatible = "brcm,bcm5398" },
> diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
> index 1187ebd79287..3b57f47d0e79 100644
> --- a/drivers/net/dsa/b53/b53_priv.h
> +++ b/drivers/net/dsa/b53/b53_priv.h
> @@ -48,6 +48,7 @@ struct b53_io_ops {
>  enum {
>   BCM5325_DEVICE_ID = 0x25,
>   BCM5365_DEVICE_ID = 0x65,
> + BCM5389_DEVICE_ID = 0x89,
>   BCM5395_DEVICE_ID = 0x95,
>   BCM5397_DEVICE_ID = 0x97,
>   BCM5398_DEVICE_ID = 0x98,
> 

-- 
Florian


Re: [PATCH mlx5-next 1/2] net/mlx5: Add temperature warning event to log

2018-05-30 Thread Saeed Mahameed
On Wed, 2018-05-30 at 18:17 +0200, Andrew Lunn wrote:
> > Hi Andrew, yes the temperature is available by other means, this
> > patch
> > is needed for alert information reasons in order to know which
> > internal
> > sensors triggered the alarm.
> > We are working in parallel to expose temperature sensor to hwmon,
> > but
> > this is still WIP.
> > 
> > 
> > Is it ok to have both ?
> 
> Hi Saeed
> 
> Ideally no. hwmon has mechanisms for setting alarm thresholds, and
> indicating the thresholds have been exceeded. There are also ways to
> tie this to thermal zones, so the system can react on overheating,
> slow down the CPU, drop voltages, ramp up fans, etc. hwmon should be
> your primary interface, not dmesg.
> 

Yes we are working on this, but it is not something that can happen
soon since we need to define the correct Firmware APIs which are still
WIP.

> But if you are stuck doing things in the wrong order, i guess it is
> O.K. I don't think dmesg is a Binary API, so you can remove it later.
> 

Yes this is the plan, once the hwmon is supported we will remove the
extra dmesg warnings.

>  Andrew

Re: [PATCH mlx5-next 2/2] net/mlx5: Add FPGA QP error event

2018-05-30 Thread Saeed Mahameed
On Wed, 2018-05-30 at 18:21 +0200, Andrew Lunn wrote:
> On Wed, May 30, 2018 at 03:14:20PM +, Saeed Mahameed wrote:
> > On Wed, 2018-05-30 at 03:07 +0200, Andrew Lunn wrote:
> > > On Tue, May 29, 2018 at 05:19:54PM -0700, Saeed Mahameed wrote:
> > > > From: Ilan Tayari 
> > > > 
> > > > The FPGA QP event fires whenever a QP on the FPGA transitions
> > > > to the error state.
> > > 
> > > FPGA i know, field programmable gate array. Could you offer some
> > > clue
> > > as to what QP means?
> > > 
> > 
> > Hi Andrew, QP "Queue Pair" is a well-known rdma concept, and widely used
> > in mlx drivers; it is used in the driver as a ring buffer to
> > communicate with the FPGA device.
> 
> O.K. Thanks.
> 
> It is hard to get the right balance between assuming people know
> everything, and know nothing. But you can help teach people these
> terms in commit messages:
> 
>   The FPGA queue pair event fires whenever a QP on the FPGA
>   transitions to the error state.
> 

Sure will fix the commit message in V2,

Thanks a lot for the feedback.

>Andrew

Re: [v2] MAINTAINERS: add myself as maintainer for QorIQ PTP clock driver

2018-05-30 Thread David Miller
From: Yangbo Lu 
Date: Tue, 29 May 2018 11:47:44 +0800

> Added myself as maintainer for QorIQ PTP clock driver.
> Since gianfar_ptp.c was renamed to ptp_qoriq.c, let's
> maintain it under QorIQ PTP clock driver.
> 
> Signed-off-by: Yangbo Lu 
> ---
> Changes for v2:
>   - Dropped dpaa2/rtc part.

Applied to net-next, thanks.


Re: [PATCH] drivers/net: Fix various unnecessary characters after logging newlines

2018-05-30 Thread David Miller
From: Joe Perches 
Date: Mon, 28 May 2018 19:51:57 -0700

> Remove and coalesce formats when there is an unnecessary
> character after a logging newline.  These extra characters
> cause logging defects.
> 
> Miscellanea:
> 
> o Coalesce formats
> 
> Signed-off-by: Joe Perches 

Applied to net-next, thanks Joe.
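
For readers unfamiliar with this defect class, a minimal userspace sketch
(an illustration only, with fprintf standing in for printk; not code from
the patch):

#include <stdio.h>

int main(void)
{
	/* Buggy: the "." after the newline becomes a stray second line. */
	fprintf(stderr, "link reset failed\n.");
	/* Fixed: punctuation moved before the newline, one complete line. */
	fprintf(stderr, "link reset failed.\n");
	return 0;
}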


Re: [PATCH bpf-next 08/11] bpf: fix cbpf parser bug for octal numbers

2018-05-30 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Range is 0-7, not 0-9, otherwise parser silently excludes it from the
> strtol() rather than throwing an error.
>
> Reported-by: Marc Boschma 
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

> ---
>  tools/bpf/bpf_exp.l | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tools/bpf/bpf_exp.l b/tools/bpf/bpf_exp.l
> index bd83149..4da8d05 100644
> --- a/tools/bpf/bpf_exp.l
> +++ b/tools/bpf/bpf_exp.l
> @@ -175,7 +175,7 @@ extern void yyerror(const char *str);
> yylval.number = strtol(yytext, NULL, 10);
> return number;
> }
> -([0][0-9]+){
> +([0][0-7]+){
> yylval.number = strtol(yytext + 1, NULL, 8);
> return number;
> }
> --
> 2.9.5
>
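
To make the bug concrete, a standalone sketch (userspace C, not part of
bpf_exp.l): the old pattern hands a token like "09" to strtol() with base 8,
which stops at the first non-octal digit and silently yields 0 instead of
raising a parse error.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *tok = "09";	/* matched by the old ([0][0-9]+) rule */
	char *end;
	long v = strtol(tok + 1, &end, 8);	/* skip the leading 0 */

	/* prints: value=0 unparsed="9", so the 9 is silently excluded */
	printf("value=%ld unparsed=\"%s\"\n", v, end);
	return 0;
}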


Re: [PATCH bpf-next 04/11] bpf: show prog and map id in fdinfo

2018-05-30 Thread Jesper Dangaard Brouer
On Wed, 30 May 2018 09:15:25 -0700
Song Liu  wrote:

> On Tue, May 29, 2018 at 12:55 PM, Daniel Borkmann  
> wrote:
> > On 05/29/2018 07:27 PM, Jesper Dangaard Brouer wrote:  
> >> On Mon, 28 May 2018 02:43:37 +0200
> >> Daniel Borkmann  wrote:
> >>  
> >>> It's trivial and straightforward to expose it for scripts that can
> >>> then use it along with bpftool in order to inspect an individual
> >>> application's used maps and progs. Right now we dump some basic
> >>> information in the fdinfo file but with the help of the map/prog
> >>> id full introspection becomes possible now.
> >>>
> >>> Signed-off-by: Daniel Borkmann 
> >>> Acked-by: Alexei Starovoitov   
> 
> Acked-by: Song Liu 
> 
> >>
> >> AFAICR iproute uses this proc fdinfo, for pinned maps.  Have you tested
> >> if this change is handled gracefully by tc ?  
> >
> > Yep, it works just fine, I also tested it before submission.  

Sounds good.

Acked-by: Jesper Dangaard Brouer 

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: [PATCH bpf-next 07/11] bpf: make sure to clear unused fields in tunnel/xfrm state fetch

2018-05-30 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Since the remaining bits are not filled in struct bpf_tunnel_key
> resp. struct bpf_xfrm_state and originate from uninitialized stack
> space, we should make sure to clear them before handing control
> back to the program.
>
> Also add a padding element to struct bpf_xfrm_state for future use
> similar as we have in struct bpf_tunnel_key and clear it as well.
>
>   struct bpf_xfrm_state {
>   __u32  reqid;/* 0 4 */
>   __u32  spi;  /* 4 4 */
>   __u16  family;   /* 8 2 */
>
>   /* XXX 2 bytes hole, try to pack */
>
>   union {
>   __u32  remote_ipv4;  /*   4 */
>   __u32  remote_ipv6[4];   /*  16 */
>   };   /* 12    16 */
>
>   /* size: 28, cachelines: 1, members: 4 */
>   /* sum members: 26, holes: 1, sum holes: 2 */
>   /* last cacheline: 28 bytes */
>   };
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

> ---
>  include/uapi/linux/bpf.h | 3 ++-
>  net/core/filter.c| 6 ++
>  2 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e2853aa..7108711 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2214,7 +2214,7 @@ struct bpf_tunnel_key {
> };
> __u8 tunnel_tos;
> __u8 tunnel_ttl;
> -   __u16 tunnel_ext;
> +   __u16 tunnel_ext;   /* Padding, future use. */
> __u32 tunnel_label;
>  };
>
> @@ -2225,6 +2225,7 @@ struct bpf_xfrm_state {
> __u32 reqid;
> __u32 spi;  /* Stored in network byte order */
> __u16 family;
> +   __u16 ext;  /* Padding, future use. */
> union {
> __u32 remote_ipv4;  /* Stored in network byte order */
> __u32 remote_ipv6[4];   /* Stored in network byte order */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 717c740..5ceb5e6 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3445,6 +3445,7 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, 
> skb, struct bpf_tunnel_key
> to->tunnel_id = be64_to_cpu(info->key.tun_id);
> to->tunnel_tos = info->key.tos;
> to->tunnel_ttl = info->key.ttl;
> +   to->tunnel_ext = 0;
>
> if (flags & BPF_F_TUNINFO_IPV6) {
> memcpy(to->remote_ipv6, >key.u.ipv6.src,
> @@ -3452,6 +3453,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, 
> skb, struct bpf_tunnel_key
> to->tunnel_label = be32_to_cpu(info->key.label);
> } else {
> to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
> +   memset(>remote_ipv6[1], 0, sizeof(__u32) * 3);
> +   to->tunnel_label = 0;
> }
>
> if (unlikely(size != sizeof(struct bpf_tunnel_key)))
> @@ -4047,11 +4050,14 @@ BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, 
> skb, u32, index,
> to->reqid = x->props.reqid;
> to->spi = x->id.spi;
> to->family = x->props.family;
> +   to->ext = 0;
> +
> if (to->family == AF_INET6) {
> memcpy(to->remote_ipv6, x->props.saddr.a6,
>sizeof(to->remote_ipv6));
> } else {
> to->remote_ipv4 = x->props.saddr.a4;
> +   memset(>remote_ipv6[1], 0, sizeof(__u32) * 3);
> }
>
> return 0;
> --
> 2.9.5
>
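
To see the hole pahole reports above, a standalone sketch (a userspace
reconstruction of the struct before the 'ext' member was added, not the
uapi header itself):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct xfrm_state_demo {
	uint32_t reqid;
	uint32_t spi;
	uint16_t family;	/* ends at offset 10 */
	union {			/* 4-byte alignment pushes this to 12 */
		uint32_t remote_ipv4;
		uint32_t remote_ipv6[4];
	};
};

int main(void)
{
	/* prints 12 and 28: bytes 10..11 are uninitialized padding */
	printf("union offset = %zu, total size = %zu\n",
	       offsetof(struct xfrm_state_demo, remote_ipv4),
	       sizeof(struct xfrm_state_demo));
	return 0;
}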


Re: [PATCH bpf-next 05/11] bpf: avoid retpoline for lookup/update/delete calls on maps

2018-05-30 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> While some of the BPF map lookup helpers provide a ->map_gen_lookup()
> callback for inlining the map lookup altogether it is not available
> for every map, so the remaining ones have to call bpf_map_lookup_elem()
> helper which does a dispatch to map->ops->map_lookup_elem(). In
> times of retpolines, this will control and trap speculative execution
> rather than letting it do its work for the indirect call and will
> therefore cause a slowdown. Likewise, bpf_map_update_elem() and
> bpf_map_delete_elem() do not have an inlined version and need to call
> into their map->ops->map_update_elem() resp. map->ops->map_delete_elem()
> handlers.
>
> Before:
>
>   # bpftool p d x i 1
> 0: (bf) r2 = r10
> 1: (07) r2 += -8
> 2: (7a) *(u64 *)(r2 +0) = 0
> 3: (18) r1 = map[id:1]
> 5: (85) call __htab_map_lookup_elem#232656
> 6: (15) if r0 == 0x0 goto pc+4
> 7: (71) r1 = *(u8 *)(r0 +35)
> 8: (55) if r1 != 0x0 goto pc+1
> 9: (72) *(u8 *)(r0 +35) = 1
>10: (07) r0 += 56
>11: (15) if r0 == 0x0 goto pc+4
>12: (bf) r2 = r0
>13: (18) r1 = map[id:1]
>15: (85) call bpf_map_delete_elem#215008  <-- indirect call via
>16: (95) exit helper
>
> After:
>
>   # bpftool p d x i 1
> 0: (bf) r2 = r10
> 1: (07) r2 += -8
> 2: (7a) *(u64 *)(r2 +0) = 0
> 3: (18) r1 = map[id:1]
> 5: (85) call __htab_map_lookup_elem#233328
> 6: (15) if r0 == 0x0 goto pc+4
> 7: (71) r1 = *(u8 *)(r0 +35)
> 8: (55) if r1 != 0x0 goto pc+1
> 9: (72) *(u8 *)(r0 +35) = 1
>10: (07) r0 += 56
>11: (15) if r0 == 0x0 goto pc+4
>12: (bf) r2 = r0
>13: (18) r1 = map[id:1]
>15: (85) call htab_lru_map_delete_elem#238240  <-- direct call
>16: (95) exit
>
> In all three lookup/update/delete cases however we can use the actual
> address of the map callback directly if we find that there's only a
> single path with a map pointer leading to the helper call, meaning
> when the map pointer has not been poisoned from verifier side.
> Example code can be seen above for the delete case.
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

> ---
>  include/linux/filter.h |  3 +++
>  kernel/bpf/hashtab.c   | 12 ++---
>  kernel/bpf/verifier.c  | 67 
> +-
>  3 files changed, 62 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index b443f70..d407ede 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -301,6 +301,9 @@ struct xdp_buff;
>
>  /* Function call */
>
> +#define BPF_CAST_CALL(x)   \
> +   ((u64 (*)(u64, u64, u64, u64, u64))(x))
> +
>  #define BPF_EMIT_CALL(FUNC)\
> ((struct bpf_insn) {\
> .code  = BPF_JMP | BPF_CALL,\
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index b76828f..3ca2198 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -503,7 +503,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, 
> struct bpf_insn *insn_buf)
> struct bpf_insn *insn = insn_buf;
> const int ret = BPF_REG_0;
>
> -   *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
> +   BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
> +(void *(*)(struct bpf_map *map, void *key))NULL));
> +   *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
> *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
> *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
> offsetof(struct htab_elem, key) +
> @@ -530,7 +532,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
> const int ret = BPF_REG_0;
> const int ref_reg = BPF_REG_1;
>
> -   *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
> +   BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
> +(void *(*)(struct bpf_map *map, void *key))NULL));
> +   *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
> *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
> *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
>   offsetof(struct htab_elem, lru_node) +
> @@ -1369,7 +1373,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map,
> struct bpf_insn *insn = insn_buf;
> const int ret = BPF_REG_0;
>
> -   *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
> +   BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
> +(void *(*)(struct bpf_map *map, void *key))NULL));
> +   *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
> *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
> 

[PATCH iproute2 net-next] iproute: ip route get support for sport, dport and ipproto match

2018-05-30 Thread Roopa Prabhu
From: Roopa Prabhu 

Signed-off-by: Roopa Prabhu 
---
 ip/iproute.c   | 26 +-
 man/man8/ip-route.8.in | 20 +++-
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 56dd9f2..ef04d27 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -69,7 +69,8 @@ static void usage(void)
"[ from ADDRESS iif STRING ]\n"
"[ oif STRING ] [ tos TOS ]\n"
"[ mark NUMBER ] [ vrf NAME ]\n"
-   "[ uid NUMBER ]\n"
+   "[ uid NUMBER ] [ ipproto PROTOCOL 
]\n"
+   "[ sport NUMBER ] [ dport NUMBER 
]\n"
"   ip route { add | del | change | append | replace } 
ROUTE\n"
"SELECTOR := [ root PREFIX ] [ match PREFIX ] [ exact PREFIX 
]\n"
"[ table TABLE_ID ] [ vrf NAME ] [ proto RTPROTO 
]\n"
@@ -1994,6 +1995,29 @@ static int iproute_get(int argc, char **argv)
req.r.rtm_family = addr.family;
addattr_l(&req.n, sizeof(req), RTA_NEWDST,
  &addr.data, addr.bytelen);
+   } else if (matches(*argv, "sport") == 0) {
+   __be16 sport;
+
+   NEXT_ARG();
+   if (get_be16(&sport, *argv, 0))
+   invarg("invalid sport\n", *argv);
+   addattr16(&req.n, sizeof(req), RTA_SPORT, sport);
+   } else if (matches(*argv, "dport") == 0) {
+   __be16 dport;
+
+   NEXT_ARG();
+   if (get_be16(&dport, *argv, 0))
+   invarg("invalid dport\n", *argv);
+   addattr16(&req.n, sizeof(req), RTA_DPORT, dport);
+   } else if (matches(*argv, "ipproto") == 0) {
+   int ipproto;
+
+   NEXT_ARG();
+   ipproto = inet_proto_a2n(*argv);
+   if (ipproto < 0)
+   invarg("Invalid \"ipproto\" value\n",
+  *argv);
+   addattr8(&req.n, sizeof(req), RTA_IP_PROTO, ipproto);
} else {
inet_prefix addr;
 
diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in
index b28f3d2..b21a847 100644
--- a/man/man8/ip-route.8.in
+++ b/man/man8/ip-route.8.in
@@ -38,7 +38,13 @@ ip-route \- routing table management
 .B  tos
 .IR TOS " ] [ "
 .B  vrf
-.IR NAME " ] "
+.IR NAME " ] [ "
+.B  ipproto
+.IR PROTOCOL " ] [ "
+.B  sport
+.IR NUMBER " ] [ "
+.B  dport
+.IR NUMBER " ] "
 
 .ti -8
 .BR "ip route" " { " add " | " del " | " change " | " append " | "\
@@ -1045,6 +1051,18 @@ the firewall mark
 force the vrf device on which this packet will be routed.
 
 .TP
+.BI ipproto " PROTOCOL"
+ip protocol as seen by the route lookup
+
+.TP
+.BI sport " NUMBER"
+source port as seen by the route lookup
+
+.TP
+.BI dport " NUMBER"
+destination port as seen by the route lookup
+
+.TP
 .B connected
 if no source address
 .RB "(option " from ")"
-- 
2.1.4
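
A hedged usage sketch for the new keywords (address, ports and protocol are
hypothetical, and the kernel must support RTA_SPORT/RTA_DPORT/RTA_IP_PROTO
in route get requests):

  # ip route get 10.1.1.2 ipproto tcp sport 34000 dport 80

The three keywords feed the corresponding attributes into the FIB lookup,
so the result matches what a real flow with that 4-tuple would hit.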



Re: [PATCH bpf-next 09/11] bpf: fix context access in tracing progs on 32 bit archs

2018-05-30 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Wang reported that all the testcases for BPF_PROG_TYPE_PERF_EVENT
> program type in test_verifier report the following errors on x86_32:
>
>   172/p unpriv: spill/fill of different pointers ldx FAIL
>   Unexpected error message!
>   0: (bf) r6 = r10
>   1: (07) r6 += -8
>   2: (15) if r1 == 0x0 goto pc+3
>   R1=ctx(id=0,off=0,imm=0) R6=fp-8,call_-1 R10=fp0,call_-1
>   3: (bf) r2 = r10
>   4: (07) r2 += -76
>   5: (7b) *(u64 *)(r6 +0) = r2
>   6: (55) if r1 != 0x0 goto pc+1
>   R1=ctx(id=0,off=0,imm=0) R2=fp-76,call_-1 R6=fp-8,call_-1 R10=fp0,call_-1 fp-8=fp
>   7: (7b) *(u64 *)(r6 +0) = r1
>   8: (79) r1 = *(u64 *)(r6 +0)
>   9: (79) r1 = *(u64 *)(r1 +68)
>   invalid bpf_context access off=68 size=8
>
>   378/p check bpf_perf_event_data->sample_period byte load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (71) r0 = *(u8 *)(r1 +68)
>   invalid bpf_context access off=68 size=1
>
>   379/p check bpf_perf_event_data->sample_period half load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (69) r0 = *(u16 *)(r1 +68)
>   invalid bpf_context access off=68 size=2
>
>   380/p check bpf_perf_event_data->sample_period word load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (61) r0 = *(u32 *)(r1 +68)
>   invalid bpf_context access off=68 size=4
>
>   381/p check bpf_perf_event_data->sample_period dword load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (79) r0 = *(u64 *)(r1 +68)
>   invalid bpf_context access off=68 size=8
>
> Reason is that struct pt_regs on x86_32 doesn't fully align to 8 byte
> boundary due to its size of 68 bytes.
>
> Therefore, bpf_ctx_narrow_access_ok() will then bail out saying that
> off & (size_default - 1) which is 68 & 7 doesn't cleanly align in the
> case of sample_period access from struct bpf_perf_event_data, hence
> verifier wrongly thinks we might be doing an unaligned access here.
> Therefore adjust this down to machine size and check the offset for
> narrow access on that basis.
>
> We also need to fix pe_prog_is_valid_access(), since we hit the check
> for off % size != 0 (e.g. 68 % 8 -> 4) in the first and last test.
>
> Reported-by: Wang YanQing 
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 
> ---
>  include/linux/filter.h   | 30 --
>  kernel/trace/bpf_trace.c | 10 --
>  2 files changed, 32 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d407ede..89903d2 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -639,16 +639,34 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
> return prog->type == BPF_PROG_TYPE_UNSPEC;
>  }
>
> -static inline bool
> -bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default)
> +static inline u32 bpf_ctx_off_adjust_machine(u32 size)
> +{
> +   const u32 size_machine = sizeof(unsigned long);
> +
> +   if (size > size_machine && size % size_machine == 0)
> +   size = size_machine;

Not sure whether I understand this correctly. I guess we only need:
if (size % size_machine == 0)
   size = size_machine;

Or, is this function equivalent to
if (size == 8 && size_machine == 4)
 size = 4;

If this is the case, maybe we can make bpf_ctx_narrow_align_ok()
simpler?

Thanks,
Song

> +
> +   return size;
> +}
> +
> +static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access,
> +  u32 size_default)
>  {
> -   bool off_ok;
> +   size_default = bpf_ctx_off_adjust_machine(size_default);
> +   size_access  = bpf_ctx_off_adjust_machine(size_access);
> +
>  #ifdef __LITTLE_ENDIAN
> -   off_ok = (off & (size_default - 1)) == 0;
> +   return (off & (size_default - 1)) == 0;
>  #else
> -   off_ok = (off & (size_default - 1)) + size == size_default;
> +   return (off & (size_default - 1)) + size_access == size_default;
>  #endif
> -   return off_ok && size <= size_default && (size & (size - 1)) == 0;
> +}
> +
> +static inline bool
> +bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
> +{
> +   return bpf_ctx_narrow_align_ok(off, size, size_default) &&
> +  size <= size_default && (size & (size - 1)) == 0;
>  }
>
>  #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 81fdf2f..7269530 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -880,8 +880,14 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
> return false;
> if (type != BPF_READ)
> return false;
> -   if (off % size != 0)
> -   return false;
> +   if (off % 
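
To make the machine-size adjustment under discussion concrete, a standalone
sketch (userspace C mirroring the helper in the patch, not the kernel header
itself):

#include <stdio.h>
#include <stdint.h>

/* mirrors bpf_ctx_off_adjust_machine() for a given machine word size */
static uint32_t adjust(uint32_t size, uint32_t size_machine)
{
	if (size > size_machine && size % size_machine == 0)
		size = size_machine;
	return size;
}

int main(void)
{
	/* x86_32: the u64 default is re-checked at word size 4, so
	 * offset 68 & (4 - 1) == 0 and the access is allowed */
	printf("machine=4: %u\n", adjust(8, 4));	/* prints 4 */
	/* x86_64: the default stays 8 */
	printf("machine=8: %u\n", adjust(8, 8));	/* prints 8 */
	return 0;
}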

[PATCH] samples/bpf: Add xdp_sample_pkts example

2018-05-30 Thread Toke Høiland-Jørgensen
This adds an example program showing how to sample packets from XDP using
the perf event buffer. The example userspace program just prints the
ethernet header for every packet sampled.

Most of the userspace code is borrowed from other examples, most notably
trace_output.

Note that the example only works when everything runs on CPU0; so
suitable smp_affinity needs to be set on the device. Some drivers seem
to reset smp_affinity when loading an XDP program, so it may be
necessary to change it after starting the example userspace program.

Signed-off-by: Toke Høiland-Jørgensen 
---
 samples/bpf/Makefile   |   4 +
 samples/bpf/xdp_sample_pkts_kern.c |  48 
 samples/bpf/xdp_sample_pkts_user.c | 147 +
 3 files changed, 199 insertions(+)
 create mode 100644 samples/bpf/xdp_sample_pkts_kern.c
 create mode 100644 samples/bpf/xdp_sample_pkts_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 1303af1..6f0c6d2 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -52,6 +52,7 @@ hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
 hostprogs-y += task_fd_query
+hostprogs-y += xdp_sample_pkts
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -107,6 +108,7 @@ xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
 task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
+xdp_sample_pkts-objs := bpf_load.o xdp_sample_pkts_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -163,6 +165,7 @@ always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
 always += task_fd_query_kern.o
+always += xdp_sample_pkts_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -179,6 +182,7 @@ HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_xdp_sample_pkts_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES += $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4  += -lrt
diff --git a/samples/bpf/xdp_sample_pkts_kern.c 
b/samples/bpf/xdp_sample_pkts_kern.c
new file mode 100644
index 000..c58183a
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_kern.c
@@ -0,0 +1,48 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define SAMPLE_SIZE 64ul
+
+struct bpf_map_def SEC("maps") my_map = {
+   .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+   .key_size = sizeof(int),
+   .value_size = sizeof(u32),
+   .max_entries = 2,
+};
+
+SEC("xdp_sample")
+int xdp_sample_prog(struct xdp_md *ctx)
+{
+   void *data_end = (void *)(long)ctx->data_end;
+   void *data = (void *)(long)ctx->data;
+
+/* Metadata will be in the perf event before the packet data. */
+   struct S {
+   u16 cookie;
+   u16 pkt_len;
+   } __attribute__((packed)) metadata;
+
+   if (data + SAMPLE_SIZE < data_end) {
+   /* The XDP perf_event_output handler will use the upper 32 bits
+* of the flags argument as a number of bytes to include of the
+* packet payload in the event data. If the size is too big, the
+* call to bpf_perf_event_output will fail and return -EFAULT.
+*
+* See bpf_xdp_event_output in net/core/filter.c.
+*/
+   u64 flags = SAMPLE_SIZE << 32;
+
+   metadata.cookie = 0xdead;
+   metadata.pkt_len = (u16)(data_end - data);
+
+   bpf_perf_event_output(ctx, &my_map, flags,
+ &metadata, sizeof(metadata));
+   }
+
+   return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/xdp_sample_pkts_user.c 
b/samples/bpf/xdp_sample_pkts_user.c
new file mode 100644
index 000..f996917
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_user.c
@@ -0,0 +1,147 @@
+/* This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include 
+
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+static int pmu_fd, if_idx = 0;
+static char *if_name;
+
+static int do_attach(int idx, int fd, const char *name)
+{
+   int err;
+
+   err = bpf_set_link_xdp_fd(idx, fd, 0);
+   if (err < 0)
+   printf("ERROR: failed to attach 

Re: [PATCH mlx5-next 2/2] net/mlx5: Add FPGA QP error event

2018-05-30 Thread Andrew Lunn
On Wed, May 30, 2018 at 03:14:20PM +, Saeed Mahameed wrote:
> On Wed, 2018-05-30 at 03:07 +0200, Andrew Lunn wrote:
> > On Tue, May 29, 2018 at 05:19:54PM -0700, Saeed Mahameed wrote:
> > > From: Ilan Tayari 
> > > 
> > > The FPGA QP event fires whenever a QP on the FPGA transitions
> > > to the error state.
> > 
> > FPGA i know, field programmable gate array. Could you offer some clue
> > as to what QP means?
> > 
> 
> Hi Andrew, QP "Queue Pair" is a well-known rdma concept, and widely used
> in mlx drivers; it is used in the driver as a ring buffer to
> communicate with the FPGA device.

O.K. Thanks.

It is hard to get the right balance between assuming people know
everything, and know nothing. But you can help teach people these
terms in commit messages:

  The FPGA queue pair event fires whenever a QP on the FPGA
  transitions to the error state.

   Andrew


Re: [PATCH mlx5-next 1/2] net/mlx5: Add temperature warning event to log

2018-05-30 Thread Andrew Lunn
> Hi Andrew, yes the temperature is available by other means, this patch
> is needed for alert information reasons in order to know which internal
> sensors triggered the alarm.
> We are working in parallel to expose temperature sensor to hwmon, but
> this is still WIP.
> 
> 
> Is it ok to have both ?

Hi Saeed

Ideally no. hwmon has mechanisms for setting alarm thresholds, and
indicating the thresholds have been exceeded. There are also ways to
tie this to thermal zones, so the system can react on overheating,
slow down the CPU, drop voltages, ramp up fans, etc. hwmon should be
your primary interface, not dmesg.

But if you are stuck doing things in the wrong order, i guess it is
O.K. I don't think dmesg is a Binary API, so you can remove it later.

 Andrew


Re: [PATCH RESEND rdma-next] net/mlx5: Use flow counter pointer as input to the query function

2018-05-30 Thread Saeed Mahameed
On Wed, 2018-05-30 at 09:44 +0300, Or Gerlitz wrote:
> This allows to un-expose the details of struct mlx5_fc and keep
> it internal to the core driver as it used to be.
> 
> Signed-off-by: Or Gerlitz 
> ---
> 
> Jason,
> 
> As you asked, I am sending a fixup in case you intend to apply
> V2 of the flow counter series [1], if there's going to be V3,
> Leon, please apply it from the beginning.
> 
> Fixed Jason's address @ my git aliases, he's with MLNX by now..
> 
> Or.
> 
> [1] https://marc.info/?l=linux-netdev=152759937829994=2
> 
>  drivers/infiniband/hw/mlx5/main.c  |  2 +-
>  drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 15 ++
> 
>  drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  | 22
> +---
>  .../net/ethernet/mellanox/mlx5/core/fs_counters.c  |  4 ++--
>  include/linux/mlx5/fs.h| 24 --


I like this patch, this should go into mlx5-next tree though, along
with "net/mlx5: Export flow counter related API"

> 
>  5 files changed, 32 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mlx5/main.c
> b/drivers/infiniband/hw/mlx5/main.c
> index ac99125..4b09dcd 100644
> --- a/drivers/infiniband/hw/mlx5/main.c
> +++ b/drivers/infiniband/hw/mlx5/main.c
> @@ -3151,7 +3151,7 @@ static int read_flow_counters(struct ib_device *ibdev,
>   struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
>   struct mlx5_ib_dev *dev = to_mdev(ibdev);
>  
> - return mlx5_fc_query(dev->mdev, fc->id,
> + return mlx5_fc_query(dev->mdev, fc,
>  &read_attr->out[IB_COUNTER_PACKETS],
>  &read_attr->out[IB_COUNTER_BYTES]);
>  }
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> index 6cab1dd..f63dfbc 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> @@ -2104,21 +2104,18 @@ static int mlx5_eswitch_query_vport_drop_stats(struct mlx5_core_dev *dev,
>   struct mlx5_vport *vport = >vports[vport_idx];
>   u64 rx_discard_vport_down, tx_discard_vport_down;
>   u64 bytes = 0;
> - u16 idx = 0;
>   int err = 0;
>  
>   if (!vport->enabled || esw->mode != SRIOV_LEGACY)
>   return 0;
>  
> - if (vport->egress.drop_counter) {
> - idx = vport->egress.drop_counter->id;
> - mlx5_fc_query(dev, idx, &stats->rx_dropped, &bytes);
> - }
> + if (vport->egress.drop_counter)
> + mlx5_fc_query(dev, vport->egress.drop_counter,
> +   &stats->rx_dropped, &bytes);
>  
> - if (vport->ingress.drop_counter) {
> - idx = vport->ingress.drop_counter->id;
> - mlx5_fc_query(dev, idx, &stats->tx_dropped, &bytes);
> - }
> + if (vport->ingress.drop_counter)
> + mlx5_fc_query(dev, vport->ingress.drop_counter,
> +   &stats->tx_dropped, &bytes);
>  
>   if (!MLX5_CAP_GEN(dev, receive_discard_vport_down) &&
>   !MLX5_CAP_GEN(dev, transmit_discard_vport_down))
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
> b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
> index 40992ae..0211d77 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
> @@ -131,6 +131,25 @@ struct mlx5_flow_table {
>   struct rhltable fgs_hash;
>  };
>  
> +struct mlx5_fc_cache {
> + u64 packets;
> + u64 bytes;
> + u64 lastuse;
> +};
> +
> +struct mlx5_fc {
> + struct rb_node node;
> + struct list_head list;
> +
> + u64 lastpackets;
> + u64 lastbytes;
> +
> + u32 id;
> + bool deleted;
> + bool aging;
> + struct mlx5_fc_cache cache cacheline_aligned_in_smp;
> +};
> +
>  struct mlx5_ft_underlay_qp {
>   struct list_head list;
>   u32 qpn;
> @@ -210,9 +229,6 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
> unsigned long delay);
>  void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
> unsigned long interval);
> -int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id,
> -   u64 *packets, u64 *bytes);
> -
>  int mlx5_init_fs(struct mlx5_core_dev *dev);
>  void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
>  
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
> b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
> index 10f4078..58af6be 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
> @@ -314,10 +314,10 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev
> *dev)
>   }
>  }
>  
> -int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id,
> +int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
> u64 *packets, u64 *bytes)
>  {
> - return mlx5_cmd_fc_query(dev, id, packets, 

Re: [PATCH bpf-next 04/11] bpf: show prog and map id in fdinfo

2018-05-30 Thread Song Liu
On Tue, May 29, 2018 at 12:55 PM, Daniel Borkmann  wrote:
> On 05/29/2018 07:27 PM, Jesper Dangaard Brouer wrote:
>> On Mon, 28 May 2018 02:43:37 +0200
>> Daniel Borkmann  wrote:
>>
>>> It's trivial and straightforward to expose it for scripts that can
>>> then use it along with bpftool in order to inspect an individual
>>> application's used maps and progs. Right now we dump some basic
>>> information in the fdinfo file but with the help of the map/prog
>>> id full introspection becomes possible now.
>>>
>>> Signed-off-by: Daniel Borkmann 
>>> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

>>
>> AFAICR iproute uses this proc fdinfo, for pinned maps.  Have you tested
>> if this change is handled gracefully by tc ?
>
> Yep, it works just fine, I also tested it before submission.


Re: [PATCH bpf-next 10/11] bpf: sync bpf uapi header with tools

2018-05-30 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Pull in recent changes from include/uapi/linux/bpf.h.
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

> ---
>  tools/include/uapi/linux/bpf.h | 20 ++--
>  1 file changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 9b8c6e3..7108711 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -2004,6 +2004,20 @@ union bpf_attr {
>   * direct packet access.
>   * Return
>   * 0 on success, or a negative error in case of failure.
> + *
> + * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
> + * Description
> + * Return the cgroup v2 id of the socket associated with the 
> *skb*.
> + * This is roughly similar to the **bpf_get_cgroup_classid**\ ()
> + * helper for cgroup v1 by providing a tag resp. identifier that
> + * can be matched on or used for map lookups e.g. to implement
> + * policy. The cgroup v2 id of a given path in the hierarchy is
> + * exposed in user space through the f_handle API in order to get
> + * to the same 64-bit id.
> + *
> + * This helper can be used on TC egress path, but not on ingress.
> + * Return
> + * The id is returned or 0 in case the id could not be retrieved.
>   */
>  #define __BPF_FUNC_MAPPER(FN)  \
> FN(unspec), \
> @@ -2082,7 +2096,8 @@ union bpf_attr {
> FN(lwt_push_encap), \
> FN(lwt_seg6_store_bytes),   \
> FN(lwt_seg6_adjust_srh),\
> -   FN(lwt_seg6_action),
> +   FN(lwt_seg6_action),\
> +   FN(skb_cgroup_id),
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2199,7 +2214,7 @@ struct bpf_tunnel_key {
> };
> __u8 tunnel_tos;
> __u8 tunnel_ttl;
> -   __u16 tunnel_ext;
> +   __u16 tunnel_ext;   /* Padding, future use. */
> __u32 tunnel_label;
>  };
>
> @@ -2210,6 +2225,7 @@ struct bpf_xfrm_state {
> __u32 reqid;
> __u32 spi;  /* Stored in network byte order */
> __u16 family;
> +   __u16 ext;  /* Padding, future use. */
> union {
> __u32 remote_ipv4;  /* Stored in network byte order */
> __u32 remote_ipv6[4];   /* Stored in network byte order */
> --
> 2.9.5
>
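
As a usage illustration for the new helper documented above, a hedged
sketch (assumptions: a bpf_helpers.h that already declares
bpf_skb_cgroup_id, and a hypothetical policy map; the section name is
arbitrary, and the helper works on TC egress only):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") cgrp_policy = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(__u64),	/* cgroup v2 id */
	.value_size = sizeof(__u32),	/* verdict */
	.max_entries = 64,
};

SEC("cls_egress")
int egress_policy(struct __sk_buff *skb)
{
	__u64 cgid = bpf_skb_cgroup_id(skb);
	__u32 *verdict = bpf_map_lookup_elem(&cgrp_policy, &cgid);

	/* drop traffic from cgroups with a non-zero policy entry */
	if (verdict && *verdict)
		return TC_ACT_SHOT;
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";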


Re: [RFC net-next 0/4] net: sched: support replay of filter offload when binding to block

2018-05-30 Thread Or Gerlitz
On Mon, May 28, 2018 at 11:02 PM, Jakub Kicinski
 wrote:
> On Mon, 28 May 2018 13:48:28 +0300, Or Gerlitz wrote:
>> On Fri, May 25, 2018 at 5:25 AM, Jakub Kicinski wrote:
>> > This series from John adds the ability to replay filter offload requests
>> > when new offload callback is being registered on a TC block.  This is most
>> > likely to take place for shared blocks today, when a block which already
>> > has rules is bound to another interface.  Prior to this patch set if any
>> > of the rules were offloaded the block bind would fail.
>>
>> Can you elaborate a little further here? is this something that you are 
>> planning
>> to use for the uplink LAG use-case? AFAIU if we apply share-block to nfp as
>> things are prior to this patch, it would work, so there's a case where
>> it doesn't and this is now handled with the series?
>
> Just looking at things as they stand today, no bond/forward looking
> plans - nfp "supports" shared blocks by registering multiple callbacks
> to the block.  There are two problems:
>
> (a) one can't install a second callback if some rules are already
> offloaded because of:
>
> /* At this point, playback of previous block cb calls is not 
> supported,
>  * so forbid to register to block which already has some offloaded
>  * filters present.
>  */
> if (tcf_block_offload_in_use(block))
> return ERR_PTR(-EOPNOTSUPP);
>
> in __tcf_block_cb_register(), so block sharing has to be set up
> before any rules are added.
>
> (b) when block is unshared filters are not removed today and driver
> would have to sweep its rule table, as John notes.  It's not a big
> deal but this series fixes it nicely in the core, too.

OK, thanks for these two point clarifications, much helpful.

> Looking forward there are two things we can use shared blocks for: we
> can try to teach user space to share ingress blocks on all legs of bonds
> instead of trying to propagate the rules from the bond down in the
> kernel, which is more tricky to get right.  We will need reliable
> replay for that, because we want new links to be able to join and leave
> the bond when rules are already present.

> Second use case, which is more far fetched, is trying to discover and
> register callbacks for blocks of tunnel devices directly, and avoid the
> egdev infrastructure...

> We should discuss the above further, but regardless, I think this
> patchset is quite a nice addition on its own.  Would you agree?

yes, it sounds good, but I need to look deeper, a bit behind on that :(

Or.


Re: [PATCH mlx5-next v2 11/13] IB/mlx5: Add flow counters binding support

2018-05-30 Thread Jason Gunthorpe
On Wed, May 30, 2018 at 06:24:00PM +0300, Yishai Hadas wrote:
> On 5/30/2018 6:06 PM, Jason Gunthorpe wrote:
> >On Wed, May 30, 2018 at 03:31:34PM +0300, Yishai Hadas wrote:
> >>On 5/29/2018 10:56 PM, Jason Gunthorpe wrote:
> >>>On Tue, May 29, 2018 at 04:09:15PM +0300, Leon Romanovsky wrote:
> diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
> index 508ea8c82da7..ef3f430a7050 100644
> +++ b/include/uapi/rdma/mlx5-abi.h
> @@ -443,4 +443,18 @@ enum {
>   enum {
>   MLX5_IB_CLOCK_INFO_V1  = 0,
>   };
> +
> +struct mlx5_ib_flow_counters_data {
> + __aligned_u64   counters_data;
> + __u32   ncounters;
> + __u32   reserved;
> +};
> +
> +struct mlx5_ib_create_flow {
> + __u32   ncounters_data;
> + __u32   reserved;
> + /* Following are counters data based on ncounters_data */
> + struct mlx5_ib_flow_counters_data data[];
> +};
> +
>   #endif /* MLX5_ABI_USER_H */
> >>>
> >>>This uapi thing still needs to be fixed as I pointed out before.
> >>
> >>In V3 we can go with below, no change in memory layout but it can clarify
> >>the code/usage.
> >>
> >>struct mlx5_ib_flow_counters_desc {
> >> __u32   description;
> >> __u32   index;
> >>};
> >>
> >>struct mlx5_ib_flow_counters_data {
> >> RDMA_UAPI_PTR(struct mlx5_ib_flow_counters_desc *, counters_data);
> >> __u32   ncounters;
> >> __u32   reserved;
> >>};
> >
> >OK, this is what I asked for originally..
> >
> >>struct mlx5_ib_create_flow {
> >> __u32   ncounters_data;
> >> __u32   reserved;
> >> /* Following are counters data based on ncounters_data */
> >> struct mlx5_ib_flow_counters_data data[];
> >>
> >>
> >>>I still can't figure out why this should be a 2d array.
> >>
> >>This comes to support the future case of multiple counters objects/specs
> >>passed with the same flow. There is a need to differentiate mapping data for
> >>each counters object and that is done via the 'ncounters_data' field and the
> >>2d array.
> >
> >This still doesn't make any sense to me. How are these multiple
> >counters objects/specs going to be identified?
> >
> >Basically, what does the array index for data[] mean? Should it match
> >the spec index from the main verb or something?
> >
> 
> Each entry in the data[] should match a corresponding counter object that
> was pointed by a counters spec upon the flow creation.

What if there are a mixture of specs, some with counters and some
without?

The index is just matching the index of the spec? That makes sense.

> >This is a good place for a comment, since the intention is completely
> >opaque here.
> 
> Sure, we'll add comment to clarify the above.

Sure, can leave the flex array then too

Jason
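
To ground the discussion, a hedged sketch of how userspace might fill the
V3 layout proposed above (the field semantics and values are assumptions
drawn from this thread, not a final ABI):

#include <stdint.h>
#include <linux/types.h>

struct mlx5_ib_flow_counters_desc {
	__u32 description;	/* what this counter measures */
	__u32 index;		/* slot within the counters object */
};

struct mlx5_ib_flow_counters_data {
	__aligned_u64 counters_data;	/* user pointer to the descs */
	__u32 ncounters;
	__u32 reserved;
};

int main(void)
{
	/* one desc per counter; one data[] entry per counters spec */
	struct mlx5_ib_flow_counters_desc descs[2] = {
		{ .description = 0, .index = 0 },	/* e.g. packets */
		{ .description = 1, .index = 1 },	/* e.g. bytes */
	};
	struct mlx5_ib_flow_counters_data data = {
		.counters_data = (__aligned_u64)(uintptr_t)descs,
		.ncounters = 2,
	};

	/* 'data' would be embedded in the flow-create command buffer */
	return data.ncounters == 2 ? 0 : 1;
}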


[PATCH] b53: Add brcm5389 support

2018-05-30 Thread Damien Thébault
This patch adds support for the BCM5389 switch connected through MDIO.

Signed-off-by: Damien Thébault 
---
 drivers/net/dsa/b53/b53_common.c | 13 +
 drivers/net/dsa/b53/b53_mdio.c   |  5 -
 drivers/net/dsa/b53/b53_priv.h   |  1 +
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 78616787f2a3..3da5fca77cbd 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1711,6 +1711,18 @@ static const struct b53_chip_data b53_switch_chips[] = {
.cpu_port = B53_CPU_PORT_25,
.duplex_reg = B53_DUPLEX_STAT_FE,
},
+   {
+   .chip_id = BCM5389_DEVICE_ID,
+   .dev_name = "BCM5389",
+   .vlans = 4096,
+   .enabled_ports = 0x1f,
+   .arl_entries = 4,
+   .cpu_port = B53_CPU_PORT,
+   .vta_regs = B53_VTA_REGS,
+   .duplex_reg = B53_DUPLEX_STAT_GE,
+   .jumbo_pm_reg = B53_JUMBO_PORT_MASK,
+   .jumbo_size_reg = B53_JUMBO_MAX_SIZE,
+   },
{
.chip_id = BCM5395_DEVICE_ID,
.dev_name = "BCM5395",
@@ -2034,6 +2046,7 @@ int b53_switch_detect(struct b53_device *dev)
else
dev->chip_id = BCM5365_DEVICE_ID;
break;
+   case BCM5389_DEVICE_ID:
case BCM5395_DEVICE_ID:
case BCM5397_DEVICE_ID:
case BCM5398_DEVICE_ID:
diff --git a/drivers/net/dsa/b53/b53_mdio.c b/drivers/net/dsa/b53/b53_mdio.c
index fa7556f5d4fb..a533a90e3904 100644
--- a/drivers/net/dsa/b53/b53_mdio.c
+++ b/drivers/net/dsa/b53/b53_mdio.c
@@ -285,6 +285,7 @@ static const struct b53_io_ops b53_mdio_ops = {
 #define B53_BRCM_OUI_1 0x0143bc00
 #define B53_BRCM_OUI_2 0x03625c00
 #define B53_BRCM_OUI_3 0x00406000
+#define B53_BRCM_OUI_4 0x01410c00
 
 static int b53_mdio_probe(struct mdio_device *mdiodev)
 {
@@ -311,7 +312,8 @@ static int b53_mdio_probe(struct mdio_device *mdiodev)
 */
if ((phy_id & 0xfffffc00) != B53_BRCM_OUI_1 &&
(phy_id & 0xfffffc00) != B53_BRCM_OUI_2 &&
-   (phy_id & 0xfffffc00) != B53_BRCM_OUI_3) {
+   (phy_id & 0xfffffc00) != B53_BRCM_OUI_3 &&
+   (phy_id & 0xfffffc00) != B53_BRCM_OUI_4) {
dev_err(&mdiodev->dev, "Unsupported device: 0x%08x\n", phy_id);
return -ENODEV;
}
@@ -360,6 +362,7 @@ static const struct of_device_id b53_of_match[] = {
{ .compatible = "brcm,bcm53125" },
{ .compatible = "brcm,bcm53128" },
{ .compatible = "brcm,bcm5365" },
+   { .compatible = "brcm,bcm5389" },
{ .compatible = "brcm,bcm5395" },
{ .compatible = "brcm,bcm5397" },
{ .compatible = "brcm,bcm5398" },
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 1187ebd79287..3b57f47d0e79 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -48,6 +48,7 @@ struct b53_io_ops {
 enum {
BCM5325_DEVICE_ID = 0x25,
BCM5365_DEVICE_ID = 0x65,
+   BCM5389_DEVICE_ID = 0x89,
BCM5395_DEVICE_ID = 0x95,
BCM5397_DEVICE_ID = 0x97,
BCM5398_DEVICE_ID = 0x98,
-- 
2.17.0
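
To illustrate the probe-side check this patch extends, a standalone sketch
(userspace; the PHY ID value is hypothetical): the low 10 bits of the
32-bit PHY ID word are masked off so only the vendor OUI bits are compared.

#include <stdio.h>
#include <stdint.h>

#define B53_BRCM_OUI_4	0x01410c00

int main(void)
{
	uint32_t phy_id = 0x01410c89;	/* hypothetical BCM5389 ID word */

	if ((phy_id & 0xfffffc00) == B53_BRCM_OUI_4)
		printf("Broadcom OUI matched, probe continues\n");
	else
		printf("unsupported device\n");
	return 0;
}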


[PATCH v2 iproute2-next] ip route: print RTA_CACHEINFO if it exists

2018-05-30 Thread dsahern
From: David Ahern 

RTA_CACHEINFO can be sent for non-cloned routes. If the attribute is
present print it. Allows route dumps to print expires times for example
which can exist on FIB entries.

Signed-off-by: David Ahern 
---
v2
- leave print_cache_flags under r->rtm_flags & RTM_F_CLONED check

 ip/iproute.c | 13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 56dd9f25e38e..254d7abd2abf 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -899,17 +899,14 @@ int print_route(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
   rta_getattr_u32(tb[RTA_UID]));
 
if (r->rtm_family == AF_INET) {
-   if (r->rtm_flags & RTM_F_CLONED) {
+   if (r->rtm_flags & RTM_F_CLONED)
print_cache_flags(fp, r->rtm_flags);
 
-   if (tb[RTA_CACHEINFO])
-   print_rta_cacheinfo(fp, 
RTA_DATA(tb[RTA_CACHEINFO]));
-   }
+   if (tb[RTA_CACHEINFO])
+   print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO]));
} else if (r->rtm_family == AF_INET6) {
-   if (r->rtm_flags & RTM_F_CLONED) {
-   if (tb[RTA_CACHEINFO])
-   print_rta_cacheinfo(fp, 
RTA_DATA(tb[RTA_CACHEINFO]));
-   }
+   if (tb[RTA_CACHEINFO])
+   print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO]));
}
 
if (tb[RTA_METRICS])
-- 
2.11.0



Re: [PATCH mlx5-next v2 11/13] IB/mlx5: Add flow counters binding support

2018-05-30 Thread Yishai Hadas

On 5/30/2018 6:06 PM, Jason Gunthorpe wrote:

On Wed, May 30, 2018 at 03:31:34PM +0300, Yishai Hadas wrote:

On 5/29/2018 10:56 PM, Jason Gunthorpe wrote:

On Tue, May 29, 2018 at 04:09:15PM +0300, Leon Romanovsky wrote:

diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 508ea8c82da7..ef3f430a7050 100644
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -443,4 +443,18 @@ enum {
  enum {
MLX5_IB_CLOCK_INFO_V1  = 0,
  };
+
+struct mlx5_ib_flow_counters_data {
+   __aligned_u64   counters_data;
+   __u32   ncounters;
+   __u32   reserved;
+};
+
+struct mlx5_ib_create_flow {
+   __u32   ncounters_data;
+   __u32   reserved;
+   /* Following are counters data based on ncounters_data */
+   struct mlx5_ib_flow_counters_data data[];
+};
+
  #endif /* MLX5_ABI_USER_H */


This uapi thing still needs to be fixed as I pointed out before.


In V3 we can go with below, no change in memory layout but it can clarify
the code/usage.

struct mlx5_ib_flow_counters_desc {
 __u32   description;
 __u32   index;
};

struct mlx5_ib_flow_counters_data {
 RDMA_UAPI_PTR(struct mlx5_ib_flow_counters_desc *, counters_data);
 __u32   ncounters;
 __u32   reserved;
};


OK, this is what I asked for originally..


struct mlx5_ib_create_flow {
 __u32   ncounters_data;
 __u32   reserved;
 /* Following are counters data based on ncounters_data */
 struct mlx5_ib_flow_counters_data data[];



I still can't figure out why this should be a 2d array.


This comes to support the future case of multiple counters objects/specs
passed with the same flow. There is a need to differentiate mapping data for
each counters object and that is done via the 'ncounters_data' field and the
2d array.


This still doesn't make any sense to me. How are these multiple
counters objects/specs going to be identified?

Basically, what does the array index for data[] mean? Should it match
the spec index from the main verb or something?



Each entry in the data[] should match a corresponding counter object 
that was pointed by a counters spec upon the flow creation.



This is a good place for a comment, since the intention is completely
opaque here.


Sure, we'll add comment to clarify the above.


Re: [PATCH iproute2-next] ipaddress: Add support for address metric

2018-05-30 Thread David Ahern
On 5/27/18 9:10 AM, dsah...@kernel.org wrote:
> From: David Ahern 
> 
> Add support for IFA_RT_PRIORITY using the same keywords as iproute for
> RTA_PRIORITY.
> 
> Signed-off-by: David Ahern 
> ---
>  include/uapi/linux/if_addr.h |  1 +
>  ip/ipaddress.c   | 15 ++-
>  man/man8/ip-address.8.in |  6 ++
>  3 files changed, 21 insertions(+), 1 deletion(-)

applied to iproute2-next.


Re: [PATCH mlx5-next 2/2] net/mlx5: Add FPGA QP error event

2018-05-30 Thread Saeed Mahameed
On Wed, 2018-05-30 at 03:07 +0200, Andrew Lunn wrote:
> On Tue, May 29, 2018 at 05:19:54PM -0700, Saeed Mahameed wrote:
> > From: Ilan Tayari 
> > 
> > The FPGA QP event fires whenever a QP on the FPGA transitions
> > to the error state.
> 
> FPGA i know, field programmable gate array. Could you offer some clue
> as to what QP means?
> 

Hi Andrew, QP "Queue Pair" is a well-known rdma concept, and widely used
in mlx drivers; it is used in the driver as a ring buffer to
communicate with the FPGA device.
 
>Thanks
>   Andrew

Re: [PATCH mlx5-next 1/2] net/mlx5: Add temperature warning event to log

2018-05-30 Thread Saeed Mahameed
On Wed, 2018-05-30 at 03:04 +0200, Andrew Lunn wrote:
> On Tue, May 29, 2018 at 05:19:53PM -0700, Saeed Mahameed wrote:
> > From: Ilan Tayari 
> > 
> > Temperature warning event is sent by FW to indicate high
> > temperature
> > as detected by one of the sensors on the board.
> > Add handling of this event by writing the numbers of the alert
> > sensors
> > to the kernel log.
> 
> Hi Saaed
> 
> Is the temperature itself available? If so, it would be better to
> expose this as a hwmon device per temperature sensor.
> 

Hi Andrew, yes the temperature is available by other means, this patch
is needed for alert information reasons in order to know which internal
sensors triggered the alarm.
We are working in parallel to expose temperature sensor to hwmon, but
this is still WIP.


Is it ok to have both ?

>Andrew

Re: [PATCH mlx5-next v2 11/13] IB/mlx5: Add flow counters binding support

2018-05-30 Thread Jason Gunthorpe
On Wed, May 30, 2018 at 03:31:34PM +0300, Yishai Hadas wrote:
> On 5/29/2018 10:56 PM, Jason Gunthorpe wrote:
> >On Tue, May 29, 2018 at 04:09:15PM +0300, Leon Romanovsky wrote:
> >>diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
> >>index 508ea8c82da7..ef3f430a7050 100644
> >>+++ b/include/uapi/rdma/mlx5-abi.h
> >>@@ -443,4 +443,18 @@ enum {
> >>  enum {
> >>MLX5_IB_CLOCK_INFO_V1  = 0,
> >>  };
> >>+
> >>+struct mlx5_ib_flow_counters_data {
> >>+   __aligned_u64   counters_data;
> >>+   __u32   ncounters;
> >>+   __u32   reserved;
> >>+};
> >>+
> >>+struct mlx5_ib_create_flow {
> >>+   __u32   ncounters_data;
> >>+   __u32   reserved;
> >>+   /* Following are counters data based on ncounters_data */
> >>+   struct mlx5_ib_flow_counters_data data[];
> >>+};
> >>+
> >>  #endif /* MLX5_ABI_USER_H */
> >
> >This uapi thing still needs to be fixed as I pointed out before.
> 
> In V3 we can go with below, no change in memory layout but it can clarify
> the code/usage.
> 
> struct mlx5_ib_flow_counters_desc {
> __u32   description;
> __u32   index;
> };
> 
> struct mlx5_ib_flow_counters_data {
> RDMA_UAPI_PTR(struct mlx5_ib_flow_counters_desc *, counters_data);
> __u32   ncounters;
> __u32   reserved;
> };

OK, this is what I asked for originally..

> struct mlx5_ib_create_flow {
> __u32   ncounters_data;
> __u32   reserved;
> /* Following are counters data based on ncounters_data */
> struct mlx5_ib_flow_counters_data data[];
> 
> 
> >I still can't figure out why this should be a 2d array.
> 
> This comes to support the future case of multiple counters objects/specs
> passed with the same flow. There is a need to differentiate mapping data for
> each counters object and that is done via the 'ncounters_data' field and the
> 2d array.

This still doesn't make any sense to me. How are these multiple
counters objects/specs going to be identified?

Basically, what does the array index for data[] mean? Should it match
the spec index from the main verb or something?

This is a good place for a comment, since the intention is completely
opaque here.

> >A flex array at the end of a struct means that the struct can never be
> >extended again which seems like a terrible idea,
> 
> The header [1] has a fixed size and will always exist even if there will be
> no counters. Future extensions [2] will be added in the memory post the flex
> array which its size depends on 'ncounters_data'. This pattern is used also
> in other extended APIs. [3]
> 
> struct mlx5_ib_create_flow {
> __u32   ncounters_data;
> __u32   reserved;
> [1] /* Header is above */
> 
> /* Following are counters data based on ncounters_data */
> struct mlx5_ib_flow_counters_data data[];
> 
> [2] Future fields.

We could do that.. But we won't - if it comes to it this will have to
move to the new kabi.

> [3] 
> https://elixir.bootlin.com/linux/latest/source/include/uapi/rdma/ib_user_verbs.h#L1145

?? That looks like a normal flex array to me.

Jason


Re: [PATCH v4 net-next 00/19] inet: frags: bring rhashtables to IP defrag

2018-05-30 Thread Tariq Toukan




On 30/05/2018 10:36 AM, Eric Dumazet wrote:

On Wed, May 30, 2018 at 3:20 AM Tariq Toukan  wrote:


Not sure, the transmit BW you get is higher than what we saw.
Anyway, we'll check this.


That is on a 40Gbit test bed (mlx4 cx/3), maybe you were using a 10Gbit NIC?



It is a ConnectX-4 50G (mlx5).

Moshe is trying out the tuning you suggested.
He'll update once he's done.



Re: [PATCH net] mlx4_core: restore optimal ICM memory allocation

2018-05-30 Thread Tariq Toukan




On 30/05/2018 7:11 AM, Eric Dumazet wrote:

Commit 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
brought a regression caught in our regression suite, thanks to KASAN.

Note that mlx4_alloc_icm() is already able to try high order allocations
and fallback to low-order allocations under high memory pressure.

We only have to tweak gfp_mask a bit, to help falling back faster,
without risking OOM killings.

BUG: KASAN: slab-out-of-bounds in to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
Read of size 4 at addr 8817df584f68 by task qp_listing_test/92585

CPU: 38 PID: 92585 Comm: qp_listing_test Tainted: G   O
Call Trace:
  [] dump_stack+0x4d/0x72
  [] print_address_description+0x6f/0x260
  [] kasan_report+0x257/0x370
  [] __asan_report_load4_noabort+0x19/0x20
  [] to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
  [] mlx4_ib_query_qp+0x1213/0x1660 [mlx4_ib]
  [] qpstat_print_qp+0x13b/0x500 [ib_uverbs]
  [] qpstat_seq_show+0x4a/0xb0 [ib_uverbs]
  [] seq_read+0xa9c/0x1230
  [] proc_reg_read+0xc1/0x180
  [] __vfs_read+0xe8/0x730
  [] vfs_read+0xf7/0x300
  [] SyS_read+0xd2/0x1b0
  [] do_syscall_64+0x186/0x420
  [] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x7f851a7bb30d
RSP: 002b:7ffd09a758c0 EFLAGS: 0293 ORIG_RAX: 
RAX: ffda RBX: 7f84ff959440 RCX: 7f851a7bb30d
RDX: 0003fc00 RSI: 7f84ff60a000 RDI: 000b
RBP: 7ffd09a75900 R08:  R09: 
R10: 0022 R11: 0293 R12: 
R13: 0003 R14: 0003 R15: 7f84ff60a000

Allocated by task 4488:
  save_stack+0x46/0xd0
  kasan_kmalloc+0xad/0xe0
  __kmalloc+0x101/0x5e0
  ib_register_device+0xc03/0x1250 [ib_core]
  mlx4_ib_add+0x27d6/0x4dd0 [mlx4_ib]
  mlx4_add_device+0xa9/0x340 [mlx4_core]
  mlx4_register_interface+0x16e/0x390 [mlx4_core]
  xhci_pci_remove+0x7a/0x180 [xhci_pci]
  do_one_initcall+0xa0/0x230
  do_init_module+0x1b9/0x5a4
  load_module+0x63e6/0x94c0
  SYSC_init_module+0x1a4/0x1c0
  SyS_init_module+0xe/0x10
  do_syscall_64+0x186/0x420
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Freed by task 0:
(stack is not available)

The buggy address belongs to the object at 8817df584f40
  which belongs to the cache kmalloc-32 of size 32
The buggy address is located 8 bytes to the right of
  32-byte region [8817df584f40, 8817df584f60)
The buggy address belongs to the page:
page:ea005f7d6100 count:1 mapcount:0 mapping:8817df584000 
index:0x8817df584fc1
flags: 0x8800100(slab)
raw: 08800100 8817df584000 8817df584fc1 0001003f
raw: ea005f3ac0a0 ea005c476760 8817fec00900 883ff78d26c0
page dumped because: kasan: bad access detected
page->mem_cgroup:883ff78d26c0

Memory state around the buggy address:
  8817df584e00: 00 03 fc fc fc fc fc fc 00 03 fc fc fc fc fc fc
  8817df584e80: 00 00 00 04 fc fc fc fc 00 00 00 fc fc fc fc fc

8817df584f00: fb fb fb fb fc fc fc fc 00 00 00 00 fc fc fc fc

   ^
  8817df584f80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
  8817df585000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

Fixes: 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
Signed-off-by: Eric Dumazet 
Cc: John Sperbeck 
Cc: Tarick Bedeir 
Cc: Qing Huang 
Cc: Daniel Jurgens 
Cc: Zhu Yanjun 
Cc: Tariq Toukan 
---
  drivers/net/ethernet/mellanox/mlx4/icm.c | 17 +++--
  1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index 685337d58276fc91baeeb64387c52985e1bc6dda..cae33d5c7dbd9ba7929adcf2127b104f6796fa5a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,13 @@
 #include "fw.h"
 
 /*
- * We allocate in page size (default 4KB on many archs) chunks to avoid high
- * order memory allocations in fragmented/high usage memory situation.
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk. Note that the chunks are not necessarily in contiguous
+ * physical memory.
  */
 enum {
-	MLX4_ICM_ALLOC_SIZE	= PAGE_SIZE,
-	MLX4_TABLE_CHUNK_SIZE	= PAGE_SIZE,
+	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
+	MLX4_TABLE_CHUNK_SIZE	= 1 << 18,
 };
 
 static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
@@ -135,6 +136,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
 	struct mlx4_icm *icm;
 	struct mlx4_icm_chunk *chunk = NULL;
 	int cur_order;
+	gfp_t mask;
 	int ret;
 
 	/* We use sg_set_buf for coherent allocs, which assumes low memory */
@@ -178,13 +180,16 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
 		while (1 << cur_order > npages)
 			--cur_order;
 
+		mask = gfp_mask;
+
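
As context for the gfp_mask tweak the changelog describes, a hedged sketch
of the idea (an illustration under assumptions, not the remainder of this
patch): for multi-page chunks, fail fast instead of entering direct
reclaim, so the existing --cur_order retry path falls back to smaller
chunks quickly.

	mask = gfp_mask;
	if (cur_order)
		mask = (mask & ~__GFP_DIRECT_RECLAIM) | __GFP_NORETRY;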

Re: [PATCH bpf v3 0/5] fix test_sockmap

2018-05-30 Thread John Fastabend
On 05/29/2018 10:56 PM, Prashant Bhole wrote:
> This series fixes error handling, timeouts and data verification in
> test_sockmap. Previously it was not able to detect failures/timeouts in
> the RX/TX threads because errors were not reported to the main thread.
> 
> Also slightly improved test output by printing parameter values (cork,
> apply, start, end) so that parameters for all tests are displayed.
> 
> Changes in v3:
>   - Skipped error checking for corked tests
> 
> Prashant Bhole (5):
>   selftests/bpf: test_sockmap, check test failure
>   selftests/bpf: test_sockmap, join cgroup in selftest mode
>   selftests/bpf: test_sockmap, fix test timeout
>   selftests/bpf: test_sockmap, fix data verification
>   selftests/bpf: test_sockmap, print additional test options
> 
>  tools/testing/selftests/bpf/test_sockmap.c | 76 +-
>  1 file changed, 58 insertions(+), 18 deletions(-)
> 

Looks good, thanks. We may want to tune the running time a bit, but
let's get this applied first. A lot of nice improvements!

.John


Re: [PATCH bpf v3 3/5] selftests/bpf: test_sockmap, fix test timeout

2018-05-30 Thread John Fastabend
On 05/29/2018 10:56 PM, Prashant Bhole wrote:
> In order to reduce the runtime of the tests, the timeout for the select()
> call was recently reduced from 1sec to 10usec. This was causing many test
> failures. It was caught by the failure-handling commits in this series.
> 
> Restore the timeout from 10usec to 1sec.
> 
> Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
> Signed-off-by: Prashant Bhole 
> ---

Quick question: how long does it take to run now with the time increase?
If it's taking too long we may need to remove some tests. I have a
longer-running test_sockmap script that I run as part of the Cilium [1]
project, where I put longer-running stress tests.

Acked-by: John Fastabend 

[1] cilium.io
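
For reference, a minimal sketch (not the test_sockmap code itself) of a
select()-based wait at the restored timeout; with tv_sec = 1 a
slow-but-correct run passes, whereas a 10usec wait expires before data
arrives and is reported as a timeout:

#include <sys/select.h>

/* Sketch: wait up to 1 second for fd to become readable, as the
 * RX side of the test does. Returns >0 if readable, 0 on timeout. */
static int wait_readable(int fd)
{
	fd_set rfds;
	struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };

	FD_ZERO(&rfds);
	FD_SET(fd, &rfds);
	return select(fd + 1, &rfds, NULL, NULL, &tv);
}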


Re: [PATCH bpf v3 1/5] selftests/bpf: test_sockmap, check test failure

2018-05-30 Thread John Fastabend
On 05/29/2018 10:56 PM, Prashant Bhole wrote:
> Test failures are not identified because the exit codes of the RX/TX
> threads are not checked, and the threads do not return correct exit
> codes.
> 
> - Return an exit code from each thread depending on test execution status
> - In the main thread, check the exit codes of the RX/TX threads
> - Skip error checking for corked tests, as they are expected to time out
> 
> Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
> Signed-off-by: Prashant Bhole 
> ---
>  tools/testing/selftests/bpf/test_sockmap.c | 25 --
>  1 file changed, 19 insertions(+), 6 deletions(-)
> 

Looks good. Thanks.

Acked-by: John Fastabend 
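
The fix follows the standard pattern of propagating a status out of
worker threads. A minimal standalone sketch (hypothetical code, not the
actual test_sockmap sources):

#include <pthread.h>
#include <stdio.h>

static void *rx_thread(void *arg)
{
	long err = 0;
	/* ... receive and verify data, set err on failure ... */
	return (void *)err;	/* report status to the main thread */
}

int main(void)
{
	pthread_t rx;
	void *status;

	if (pthread_create(&rx, NULL, rx_thread, NULL))
		return 1;
	pthread_join(rx, &status);	/* previously the status was ignored */
	if ((long)status) {
		fprintf(stderr, "RX thread failed: %ld\n", (long)status);
		return 1;
	}
	return 0;
}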


[PATCH net-next] qed: Add srq core support for RoCE and iWARP

2018-05-30 Thread Yuval Bason
This patch adds support for configuring SRQ and provides the necessary
APIs for the rdma upper layer driver (qedr) to enable the SRQ feature.

Signed-off-by: Michal Kalderon 
Signed-off-by: Ariel Elior 
Signed-off-by: Yuval Bason 
---
 drivers/net/ethernet/qlogic/qed/qed_cxt.c   |   5 +-
 drivers/net/ethernet/qlogic/qed/qed_cxt.h   |   1 +
 drivers/net/ethernet/qlogic/qed/qed_hsi.h   |   2 +
 drivers/net/ethernet/qlogic/qed/qed_iwarp.c |  23 
 drivers/net/ethernet/qlogic/qed/qed_main.c  |   2 +
 drivers/net/ethernet/qlogic/qed/qed_rdma.c  | 179 +++-
 drivers/net/ethernet/qlogic/qed/qed_rdma.h  |   2 +
 drivers/net/ethernet/qlogic/qed/qed_roce.c  |  17 ++-
 include/linux/qed/qed_rdma_if.h |  12 +-
 9 files changed, 235 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index 820b226..7ed6aa0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -47,6 +47,7 @@
 #include "qed_hsi.h"
 #include "qed_hw.h"
 #include "qed_init_ops.h"
+#include "qed_rdma.h"
 #include "qed_reg_addr.h"
 #include "qed_sriov.h"
 
@@ -426,7 +427,7 @@ static void qed_cxt_set_srq_count(struct qed_hwfn *p_hwfn, u32 num_srqs)
p_mgr->srq_count = num_srqs;
 }
 
-static u32 qed_cxt_get_srq_count(struct qed_hwfn *p_hwfn)
+u32 qed_cxt_get_srq_count(struct qed_hwfn *p_hwfn)
 {
struct qed_cxt_mngr *p_mgr = p_hwfn->p_cxt_mngr;
 
@@ -2071,7 +2072,7 @@ static void qed_rdma_set_pf_params(struct qed_hwfn *p_hwfn,
u32 num_cons, num_qps, num_srqs;
enum protocol_type proto;
 
-   num_srqs = min_t(u32, 32 * 1024, p_params->num_srqs);
+   num_srqs = min_t(u32, QED_RDMA_MAX_SRQS, p_params->num_srqs);
 
if (p_hwfn->mcp_info->func_info.protocol == QED_PCI_ETH_RDMA) {
DP_NOTICE(p_hwfn,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
index a4e9586..758a8b4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
@@ -235,6 +235,7 @@ u32 qed_cxt_get_proto_tid_count(struct qed_hwfn *p_hwfn,
enum protocol_type type);
 u32 qed_cxt_get_proto_cid_start(struct qed_hwfn *p_hwfn,
enum protocol_type type);
+u32 qed_cxt_get_srq_count(struct qed_hwfn *p_hwfn);
 int qed_cxt_free_proto_ilt(struct qed_hwfn *p_hwfn, enum protocol_type proto);
 
 #define QED_CTX_WORKING_MEM 0
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 8e1e6e1..82ce401 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -9725,6 +9725,8 @@ enum iwarp_eqe_async_opcode {
IWARP_EVENT_TYPE_ASYNC_EXCEPTION_DETECTED,
IWARP_EVENT_TYPE_ASYNC_QP_IN_ERROR_STATE,
IWARP_EVENT_TYPE_ASYNC_CQ_OVERFLOW,
+   IWARP_EVENT_TYPE_ASYNC_SRQ_EMPTY,
+   IWARP_EVENT_TYPE_ASYNC_SRQ_LIMIT,
MAX_IWARP_EQE_ASYNC_OPCODE
 };
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index 2a2b101..474e6cf 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -271,6 +271,8 @@ int qed_iwarp_create_qp(struct qed_hwfn *p_hwfn,
p_ramrod->sq_num_pages = qp->sq_num_pages;
p_ramrod->rq_num_pages = qp->rq_num_pages;
 
+   p_ramrod->srq_id.srq_idx = cpu_to_le16(qp->srq_id);
+   p_ramrod->srq_id.opaque_fid = cpu_to_le16(p_hwfn->hw_info.opaque_fid);
p_ramrod->qp_handle_for_cqe.hi = cpu_to_le32(qp->qp_handle.hi);
p_ramrod->qp_handle_for_cqe.lo = cpu_to_le32(qp->qp_handle.lo);
 
@@ -3004,8 +3006,11 @@ static int qed_iwarp_async_event(struct qed_hwfn *p_hwfn,
 union event_ring_data *data,
 u8 fw_return_code)
 {
+   struct qed_rdma_events events = p_hwfn->p_rdma_info->events;
	struct regpair *fw_handle = &data->rdma_data.async_handle;
struct qed_iwarp_ep *ep = NULL;
+   u16 srq_offset;
+   u16 srq_id;
u16 cid;
 
ep = (struct qed_iwarp_ep *)(uintptr_t)HILO_64(fw_handle->hi,
@@ -3067,6 +3072,24 @@ static int qed_iwarp_async_event(struct qed_hwfn *p_hwfn,
qed_iwarp_cid_cleaned(p_hwfn, cid);
 
break;
+   case IWARP_EVENT_TYPE_ASYNC_SRQ_EMPTY:
+   DP_NOTICE(p_hwfn, "IWARP_EVENT_TYPE_ASYNC_SRQ_EMPTY\n");
+   srq_offset = p_hwfn->p_rdma_info->srq_id_offset;
+   /* FW assigns value that is no greater than u16 */
+   srq_id = ((u16)le32_to_cpu(fw_handle->lo)) - srq_offset;
+   events.affiliated_event(events.context,
+   QED_IWARP_EVENT_SRQ_EMPTY,
				&srq_id);
+   break;
+   case 

Re: [PATCH] rtnetlink: Add more well known protocol values

2018-05-30 Thread Donald Sharp
This patch is intended for net-next.

thanks!

donald

On Wed, May 30, 2018 at 8:27 AM, Donald Sharp
 wrote:
> FRRouting installs routes into the kernel associated with
> the originating protocol.  Add these values to the well
> known values in rtnetlink.h.
>
> Signed-off-by: Donald Sharp 
> ---
> v2: Fixed whitespace issues
>  include/uapi/linux/rtnetlink.h | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
> index cabb210c93af..7d8502313c99 100644
> --- a/include/uapi/linux/rtnetlink.h
> +++ b/include/uapi/linux/rtnetlink.h
> @@ -254,6 +254,11 @@ enum {
>  #define RTPROT_DHCP	16	/* DHCP client */
>  #define RTPROT_MROUTED	17	/* Multicast daemon */
>  #define RTPROT_BABEL	42	/* Babel daemon */
> +#define RTPROT_BGP	186	/* BGP Routes */
> +#define RTPROT_ISIS	187	/* ISIS Routes */
> +#define RTPROT_OSPF	188	/* OSPF Routes */
> +#define RTPROT_RIP	189	/* RIP Routes */
> +#define RTPROT_EIGRP	192	/* EIGRP Routes */
>
>  /* rtm_scope
>
> --
> 2.14.3
>


Re: [PATCH mlx5-next v2 11/13] IB/mlx5: Add flow counters binding support

2018-05-30 Thread Yishai Hadas

On 5/29/2018 10:56 PM, Jason Gunthorpe wrote:

On Tue, May 29, 2018 at 04:09:15PM +0300, Leon Romanovsky wrote:

diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 508ea8c82da7..ef3f430a7050 100644
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -443,4 +443,18 @@ enum {
  enum {
MLX5_IB_CLOCK_INFO_V1  = 0,
  };
+
+struct mlx5_ib_flow_counters_data {
+   __aligned_u64   counters_data;
+   __u32   ncounters;
+   __u32   reserved;
+};
+
+struct mlx5_ib_create_flow {
+   __u32   ncounters_data;
+   __u32   reserved;
+   /* Following are counters data based on ncounters_data */
+   struct mlx5_ib_flow_counters_data data[];
+};
+
  #endif /* MLX5_ABI_USER_H */


This uapi thing still needs to be fixed as I pointed out before.


In V3 we can go with the below; there is no change in memory layout, but
it can clarify the code/usage.


struct mlx5_ib_flow_counters_desc {
__u32   description;
__u32   index;
};

struct mlx5_ib_flow_counters_data {
RDMA_UAPI_PTR(struct mlx5_ib_flow_counters_desc *, counters_data);
__u32   ncounters;
__u32   reserved;
};

struct mlx5_ib_create_flow {
__u32   ncounters_data;
__u32   reserved;
/* Following are counters data based on ncounters_data */
struct mlx5_ib_flow_counters_data data[];
};


I still can't figure out why this should be a 2d array.


This is to support the future case of multiple counters objects/specs
passed with the same flow. There is a need to differentiate the mapping
data for each counters object, and that is done via the 'ncounters_data'
field and the 2d array.


I think it should be written simply as:

struct mlx5_ib_flow_counter_desc {
 __u32 description;
 __u32 index;
};

struct mlx5_ib_create_flow {
RDMA_UAPI_PTR(struct mlx5_ib_flow_counter_desc, counters_data);
__u32   ncounters;
__u32   reserved;
};

With the corresponding changes elsewhere.



This doesn't support the above use case.


A flex array at the end of a struct means that the struct can never be
extended again which seems like a terrible idea,


The header [1] has a fixed size and will always exist even if there are
no counters. Future extensions [2] will be added in the memory past the
flex array, whose size depends on 'ncounters_data'. This pattern is also
used in other extended APIs. [3]


struct mlx5_ib_create_flow {
	__u32	ncounters_data;
	__u32	reserved;
[1]	/* Header is above */

	/* Following are counters data based on ncounters_data */
	struct mlx5_ib_flow_counters_data data[];
};

[2] Future fields.

[3] 
https://elixir.bootlin.com/linux/latest/source/include/uapi/rdma/ib_user_verbs.h#L1145
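
To make the layout concrete, here is a sketch of how userspace might
populate the command for a single counters object; the struct names
mirror the proposal above and build_cmd() is purely illustrative:

#include <stdint.h>
#include <stdlib.h>

struct flow_counters_data {		/* mirrors mlx5_ib_flow_counters_data */
	uint64_t counters_data;		/* user pointer to a desc array */
	uint32_t ncounters;
	uint32_t reserved;
};

struct create_flow_cmd {		/* mirrors mlx5_ib_create_flow */
	uint32_t ncounters_data;
	uint32_t reserved;
	struct flow_counters_data data[];
};

/* Build a command carrying one counters object with n descriptors. */
static struct create_flow_cmd *build_cmd(uint64_t desc_ptr, uint32_t n)
{
	struct create_flow_cmd *cmd;

	cmd = calloc(1, sizeof(*cmd) + sizeof(cmd->data[0]));
	if (!cmd)
		return NULL;
	cmd->ncounters_data = 1;	/* one entry in data[] */
	cmd->data[0].counters_data = desc_ptr;
	cmd->data[0].ncounters = n;
	return cmd;
}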


[PATCH] rtnetlink: Add more well known protocol values

2018-05-30 Thread Donald Sharp
FRRouting installs routes into the kernel associated with
the originating protocol.  Add these values to the well
known values in rtnetlink.h.

Signed-off-by: Donald Sharp 
---
v2: Fixed whitespace issues
 include/uapi/linux/rtnetlink.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index cabb210c93af..7d8502313c99 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -254,6 +254,11 @@ enum {
 #define RTPROT_DHCP	16	/* DHCP client */
 #define RTPROT_MROUTED	17	/* Multicast daemon */
 #define RTPROT_BABEL	42	/* Babel daemon */
+#define RTPROT_BGP	186	/* BGP Routes */
+#define RTPROT_ISIS	187	/* ISIS Routes */
+#define RTPROT_OSPF	188	/* OSPF Routes */
+#define RTPROT_RIP	189	/* RIP Routes */
+#define RTPROT_EIGRP	192	/* EIGRP Routes */
 
 /* rtm_scope
 
-- 
2.14.3
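
As a usage illustration (a sketch, not part of the patch): a routing
daemon installing a route stamps rtm_protocol with one of these values
when building its RTM_NEWROUTE request, so the origin is preserved in
the kernel FIB:

#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>

/* Sketch: fill the rtmsg header of an RTM_NEWROUTE request so the
 * route is recorded as BGP-originated. The surrounding netlink
 * plumbing (socket, nlmsghdr, attributes) is omitted. */
static void init_route_hdr(struct rtmsg *rtm)
{
	memset(rtm, 0, sizeof(*rtm));
	rtm->rtm_family   = AF_INET;
	rtm->rtm_table    = RT_TABLE_MAIN;
	rtm->rtm_protocol = 186;	/* RTPROT_BGP once the patch lands */
	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
	rtm->rtm_type     = RTN_UNICAST;
}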



[PATCH net-next] cxgb4: Add FORCE_PAUSE bit to 32 bit port caps

2018-05-30 Thread Ganesh Goudar
Add FORCE_PAUSE bit to force local pause settings instead
of using auto negotiated values.

Signed-off-by: Santosh Rastapur 
Signed-off-by: Casey Leedom 
Signed-off-by: Ganesh Goudar 
---
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c| 10 +-
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h |  5 +++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 39da7e3..974a868 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -3941,6 +3941,7 @@ static fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16)
CAP16_TO_CAP32(FC_RX);
CAP16_TO_CAP32(FC_TX);
CAP16_TO_CAP32(ANEG);
+   CAP16_TO_CAP32(FORCE_PAUSE);
CAP16_TO_CAP32(MDIAUTO);
CAP16_TO_CAP32(MDISTRAIGHT);
CAP16_TO_CAP32(FEC_RS);
@@ -3982,6 +3983,7 @@ static fw_port_cap16_t fwcaps32_to_caps16(fw_port_cap32_t caps32)
CAP32_TO_CAP16(802_3_PAUSE);
CAP32_TO_CAP16(802_3_ASM_DIR);
CAP32_TO_CAP16(ANEG);
+   CAP32_TO_CAP16(FORCE_PAUSE);
CAP32_TO_CAP16(MDIAUTO);
CAP32_TO_CAP16(MDISTRAIGHT);
CAP32_TO_CAP16(FEC_RS);
@@ -4014,6 +4016,8 @@ static inline fw_port_cap32_t cc_to_fwcap_pause(enum cc_pause cc_pause)
fw_pause |= FW_PORT_CAP32_FC_RX;
if (cc_pause & PAUSE_TX)
fw_pause |= FW_PORT_CAP32_FC_TX;
+   if (!(cc_pause & PAUSE_AUTONEG))
+   fw_pause |= FW_PORT_CAP32_FORCE_PAUSE;
 
return fw_pause;
 }
@@ -4101,7 +4105,11 @@ int t4_link_l1cfg_core(struct adapter *adapter, unsigned int mbox,
rcap = lc->acaps | fw_fc | fw_fec | fw_mdi;
}
 
-   if (rcap & ~lc->pcaps) {
+   /* Note that older Firmware doesn't have FW_PORT_CAP32_FORCE_PAUSE, so
+* we need to exclude this from this check in order to maintain
+* compatibility ...
+*/
+   if ((rcap & ~lc->pcaps) & ~FW_PORT_CAP32_FORCE_PAUSE) {
		dev_err(adapter->pdev_dev,
			"Requested Port Capabilities %#x exceed Physical Port Capabilities %#x\n",
			rcap, lc->pcaps);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 2d91480..f1967cf 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -2475,7 +2475,7 @@ enum fw_port_cap {
FW_PORT_CAP_MDISTRAIGHT = 0x0400,
FW_PORT_CAP_FEC_RS  = 0x0800,
FW_PORT_CAP_FEC_BASER_RS= 0x1000,
-   FW_PORT_CAP_FEC_RESERVED= 0x2000,
+   FW_PORT_CAP_FORCE_PAUSE = 0x2000,
FW_PORT_CAP_802_3_PAUSE = 0x4000,
FW_PORT_CAP_802_3_ASM_DIR   = 0x8000,
 };
@@ -2522,7 +2522,8 @@ enum fw_port_mdi {
 #define	FW_PORT_CAP32_FEC_RESERVED1	0x0200UL
 #define	FW_PORT_CAP32_FEC_RESERVED2	0x0400UL
 #define	FW_PORT_CAP32_FEC_RESERVED3	0x0800UL
-#define	FW_PORT_CAP32_RESERVED2		0xf000UL
+#define FW_PORT_CAP32_FORCE_PAUSE	0x1000UL
+#define FW_PORT_CAP32_RESERVED2		0xe000UL
 
 #define FW_PORT_CAP32_SPEED_S  0
 #define FW_PORT_CAP32_SPEED_M  0xfff
-- 
2.1.0



Re: [PATCH bpf-next v7 3/6] bpf: Add IPv6 Segment Routing helpers

2018-05-30 Thread Daniel Borkmann
On 05/24/2018 12:18 PM, Daniel Borkmann wrote:
> On 05/20/2018 03:58 PM, Mathieu Xhonneux wrote:
>> The BPF seg6local hook should be powerful enough to enable users to
>> implement most of the use-cases one could think of. After some thinking,
>> we figured out that the following actions should be possible on a SRv6
>> packet, requiring 3 specific helpers :
>> - bpf_lwt_seg6_store_bytes: Modify non-sensitive fields of the SRH
>> - bpf_lwt_seg6_adjust_srh: Allow to grow or shrink a SRH
>>(to add/delete TLVs)
>> - bpf_lwt_seg6_action: Apply some SRv6 network programming actions
>>(specifically End.X, End.T, End.B6 and
>> End.B6.Encap)
>>
>> The specifications of these helpers are provided in the patch (see
>> include/uapi/linux/bpf.h).
>>
>> The non-sensitive fields of the SRH are the following : flags, tag and
>> TLVs. The other fields can not be modified, to maintain the SRH
>> integrity. Flags, tag and TLVs can easily be modified as their validity
>> can be checked afterwards via seg6_validate_srh. It is not allowed to
>> modify the segments directly. If one wants to add segments on the path,
>> he should stack a new SRH using the End.B6 action via
>> bpf_lwt_seg6_action.
>>
>> Growing, shrinking or editing TLVs via the helpers will flag the SRH as
>> invalid, and it will have to be re-validated before re-entering the IPv6
>> layer. This flag is stored in a per-CPU buffer, along with the current
>> header length in bytes.
>>
>> Storing the SRH len in bytes in the control block is mandatory when using
>> bpf_lwt_seg6_adjust_srh. The Header Ext. Length field contains the SRH
>> len rounded to 8 bytes (a padding TLV can be inserted to ensure the 8-bytes
>> boundary). When adding/deleting TLVs within the BPF program, the SRH may
>> temporarily be in an invalid state where its length cannot be rounded to 8
>> bytes without remainder, hence the need to store the length in bytes
>> separately. The caller of the BPF program can then ensure that the SRH's
>> final length is valid using this value. Again, a final SRH modified by a
>> BPF program which doesn’t respect the 8-bytes boundary will be discarded
>> as it will be considered as invalid.
>>
>> Finally, a fourth helper is provided, bpf_lwt_push_encap, which is
>> available from the LWT BPF IN hook, but not from the seg6local BPF one.
>> This helper allows to encapsulate a Segment Routing Header (either with
>> a new outer IPv6 header, or by inlining it directly in the existing IPv6
>> header) into a non-SRv6 packet. This helper is required if we want to
>> offer the possibility to dynamically encapsulate a SRH for non-SRv6 packet,
>> as the BPF seg6local hook only works on traffic already containing a SRH.
>> This is the BPF equivalent of the seg6 LWT infrastructure, which achieves
>> the same purpose but with a static SRH per route.
>>
>> These helpers require CONFIG_IPV6=y (and not =m).
>>
>> Signed-off-by: Mathieu Xhonneux 
>> Acked-by: David Lebrun 
> 
> One minor comments for follow-ups in here below.
> 
>> +BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
>> +   const void *, from, u32, len)
>> +{
>> +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
>> +struct seg6_bpf_srh_state *srh_state =
>> +		this_cpu_ptr(&seg6_bpf_srh_states);
>> +void *srh_tlvs, *srh_end, *ptr;
>> +struct ipv6_sr_hdr *srh;
>> +int srhoff = 0;
>> +
>> +if (ipv6_find_hdr(skb, , IPPROTO_ROUTING, NULL, NULL) < 0)
>> +return -EINVAL;
>> +
>> +srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
>> +srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
>> +srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
>> +
>> +ptr = skb->data + offset;
>> +if (ptr >= srh_tlvs && ptr + len <= srh_end)
>> +srh_state->valid = 0;
>> +	else if (ptr < (void *)&srh->flags ||
>> +		 ptr + len > (void *)&srh->segments)
>> +return -EFAULT;
>> +
>> +if (unlikely(bpf_try_make_writable(skb, offset + len)))
>> +return -EFAULT;
>> +
>> +memcpy(skb->data + offset, from, len);
>> +return 0;
>> +#else /* CONFIG_IPV6_SEG6_BPF */
>> +return -EOPNOTSUPP;
>> +#endif
>> +}
> 
> Instead of doing this inside the helper you can reject the program already
> in the lwt_*_func_proto() by returning NULL when !CONFIG_IPV6_SEG6_BPF. That
> way programs get rejected at verification time instead of runtime, so the
> user can probe availability more easily.

Mathieu, before this gets lost in archives, plan to follow-up on this one?
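
The suggestion, roughly: gate the protos in the func_proto callback so
the verifier refuses the program up front when the helpers are compiled
out. A sketch of the pattern, not the final patch:

static const struct bpf_func_proto *
lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
	case BPF_FUNC_lwt_seg6_store_bytes:
		return &bpf_lwt_seg6_store_bytes_proto;
	case BPF_FUNC_lwt_seg6_action:
		return &bpf_lwt_seg6_action_proto;
	case BPF_FUNC_lwt_seg6_adjust_srh:
		return &bpf_lwt_seg6_adjust_srh_proto;
#endif
	default:
		/* helpers unavailable: the verifier rejects the call site */
		return lwt_out_func_proto(func_id, prog);
	}
}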


Re: [PATCH v4 net-next 00/19] inet: frags: bring rhashtables to IP defrag

2018-05-30 Thread Eric Dumazet
On Wed, May 30, 2018 at 6:36 AM Eric Dumazet  wrote:


> Here are the good ones, using latest David Miller net tree. ( plus
> https://patchwork.ozlabs.org/patch/922528/  but that should not matter
here)

> llpaa23:/export/hda3/google/edumazet# ./netperf -H 2607:f8b0:8099:e18:: -t
> UDP_STREAM
> MIGRATED UDP STREAM TEST from ::0 (::) port 0 AF_INET6 to
> 2607:f8b0:8099:e18:: () port 0 AF_INET6
> Socket  Message  Elapsed      Messages
> Size    Size     Time         Okay Errors   Throughput
> bytes   bytes    secs            #      #   10^6bits/sec

> 212992   65507   10.00      216236      0    11331.89
> 212992           10.00      215068           11270.68


> There are a few drops because of the too-small
> /proc/sys/net/core/rmem_default (212992, as seen in the netperf output)
> for this kind of stress.
> (each 64KB datagram actually consumes half the budget ...)


Once rmem_default is set to 1,000,000 and the mtu is set back to 1500
(instead of 5102 on my testbed), the results are indeed better.

lpaa23:/export/hda3/google/edumazet# ./netperf -H 2607:f8b0:8099:e18:: -t
UDP_STREAM -l 10
MIGRATED UDP STREAM TEST from ::0 (::) port 0 AF_INET6 to
2607:f8b0:8099:e18:: () port 0 AF_INET6
Socket  Message  Elapsed      Messages
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

212992   65507   10.00      231457      0    12129.56
1000000          10.00      231457           12129.56
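
The same headroom can also be granted per socket rather than
system-wide, as in this sketch (the kernel doubles the requested value
and caps it at net.core.rmem_max):

#include <sys/socket.h>

/* Sketch: enlarge one UDP socket's receive budget instead of raising
 * net.core.rmem_default globally. */
static int grow_rcvbuf(int fd)
{
	int sz = 1000 * 1000;

	return setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
}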


Re: [PATCH bpf-next] bpftool: Support sendmsg{4,6} attach types

2018-05-30 Thread Daniel Borkmann
On 05/30/2018 02:12 AM, Song Liu wrote:
> On Tue, May 29, 2018 at 2:20 PM, Jakub Kicinski  wrote:
>> On Tue, 29 May 2018 13:29:31 -0700, Andrey Ignatov wrote:
>>> Add support for recently added BPF_CGROUP_UDP4_SENDMSG and
>>> BPF_CGROUP_UDP6_SENDMSG attach types to bpftool, update documentation
>>> and bash completion.
>>>
>>> Signed-off-by: Andrey Ignatov 
>>
>> Reviewed-by: Jakub Kicinski 
>>
>>> I'm not sure about "since 4.18" in Documentation part. I can follow-up when
>>> the next kernel version is known.
>>
>> IMHO it's fine, we can follow up if Linus decides to call it something
>> else :)
>>
>> Thanks!
> 
> Acked-by: Song Liu 

Applied to bpf-next, thanks guys!


Re: [PATCH v4 net-next 00/19] inet: frags: bring rhashtables to IP defrag

2018-05-30 Thread Eric Dumazet
On Wed, May 30, 2018 at 5:20 AM Jesper Dangaard Brouer 
wrote:

> On Mon, 28 May 2018 09:09:17 -0700
> Eric Dumazet  wrote:

> > Tariq, here are my test results : No drops for me.
> >
> > # ./netperf -H 2607:f8b0:8099:e18:: -t UDP_STREAM
> > MIGRATED UDP STREAM TEST from ::0 (::) port 0 AF_INET6 to
2607:f8b0:8099:e18:: () port 0 AF_INET6
> > Socket  Message  Elapsed      Messages
> > Size    Size     Time         Okay Errors   Throughput
> > bytes   bytes    secs            #      #   10^6bits/sec
> >
> > 212992   65507   10.00      202117      0    10592.00
> > 212992           10.00           0               0.00

> Hmm... Eric, the above results show that ALL your UDP packets were dropped!
> You have 0 okay messages and 0.00 Mbit/s throughput.

> It needs to look like below (test on i40e NIC):

> $ netperf -t UDP_STREAM -H fee0:cafe::1
> MIGRATED UDP STREAM TEST from ::0 (::) port 0 AF_INET6 to fee0:cafe::1 ()
port 0 AF_INET6 : histogram : demo
> Socket  Message  Elapsed      Messages
> Size    Size     Time         Okay Errors   Throughput
> bytes   bytes    secs            #      #   10^6bits/sec

> 212992   65507   10.00      186385      0     9767.08
> 212992           10.00      186385            9767.08


> If I manually instruct ip6tables to drop all UDP packets, then I get
> what you see... so something on your test system is likely dropping
> your UDP packets, while letting the regular netperf (TCP) control
> communication through.

> # ip6tables -I INPUT -p udp -j DROP

> $ netperf -t UDP_STREAM -H fee0:cafe::1
> MIGRATED UDP STREAM TEST from ::0 (::) port 0 AF_INET6 to fee0:cafe::1 ()
port 0 AF_INET6 : histogram : demo
> Socket  Message  Elapsed      Messages
> Size    Size     Time         Okay Errors   Throughput
> bytes   bytes    secs            #      #   10^6bits/sec

> 212992   65507   10.00      182095      0     9542.41
> 212992           10.00           0               0.00



Right you are; for some reason I copied/pasted the wrong results, taken
after _specifically_ filling up the frags to the memory limits while
trying to reproduce 'bad numbers'.

Here are the good ones, using latest David Miller net tree. ( plus
https://patchwork.ozlabs.org/patch/922528/  but that should not matter here)

llpaa23:/export/hda3/google/edumazet# ./netperf -H 2607:f8b0:8099:e18:: -t
UDP_STREAM
MIGRATED UDP STREAM TEST from ::0 (::) port 0 AF_INET6 to
2607:f8b0:8099:e18:: () port 0 AF_INET6
Socket  Message  Elapsed      Messages
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

212992   65507   10.00      216236      0    11331.89
212992           10.00      215068           11270.68


There are a few drops because of the too-small
/proc/sys/net/core/rmem_default (212992, as seen in the netperf output)
for this kind of stress.
(each 64KB datagram actually consumes half the budget ...)


Feature Request : iface may be allowed as datatype in all ipset

2018-05-30 Thread Akshat Kakkar
Is there a reason why iface is allowed to be paired only with net to
create an ipset?

I think that with the skbinfo feature available in every ipset, it should
be possible to add iface to any ipset type. Since skbinfo can store tc
classes, it would make more sense if I could pinpoint the outgoing
interface on which a class should be applied.

One direct way of doing this could be to add iface to skbinfo itself, but
I don't think that is a good suggestion.

So the other option is to have the ipset store the interface too. Besides,
when I create a tc class, I create it on a known interface, so I know
beforehand on which interface the class is created, and I can easily
specify it when adding an entry to the ipset.


Re: [PATCH net-next 0/8] nfp: offload LAG for tc flower egress

2018-05-30 Thread John Hurley
On Tue, May 29, 2018 at 11:09 PM, Jiri Pirko  wrote:
> Tue, May 29, 2018 at 04:08:48PM CEST, john.hur...@netronome.com wrote:
>>On Sat, May 26, 2018 at 3:47 AM, Jakub Kicinski
>> wrote:
>>> On Fri, 25 May 2018 08:48:09 +0200, Jiri Pirko wrote:
 Thu, May 24, 2018 at 04:22:47AM CEST, jakub.kicin...@netronome.com wrote:
 >Hi!
 >
 >This series from John adds bond offload to the nfp driver.  Patch 5
 >exposes the hash type for NETDEV_LAG_TX_TYPE_HASH to make sure nfp
 >hashing matches that of the software LAG.  This may be unnecessarily
 >conservative, let's see what LAG maintainers think :)

 So you need to restrict offload to only certain hash algo? In mlxsw, we
 just ignore the lag setting and do some hw default hashing. Would not be
 enough? Note that there's a good reason for it, as you see, in team, the
 hashing is done in a BPF function and could be totally arbitrary.
 Your patchset effectively disables team offload for nfp.
>>>
>>> My understanding is that the project requirements only called for L3/L4
>>> hash algorithm offload, hence the temptation to err on the side of
>>> caution and not offload all the bond configurations.  John can provide
>>> more details.  Not being able to offload team is unfortunate indeed.
>>
>>Hi Jiri,
>>Yes, as Jakub mentions, we restrict ourselves to L3/L4 hash algorithm
>>as this is currently what is supported in fw.
>
> In mlxsw, a default l3/l4 hash is always used, no matter what
> bonding/team sets. It is not correct, but it works with team as well.
> Perhaps we can have NETDEV_LAG_HASH_UNKNOWN to indicate to the driver to
> do some default? That would make the "team" offload functional.
>

Yes, I would agree with that.
Thanks

>>Hopefully this will change as fw features are expanded.
>>I understand the issue this presents with offloading team.
>>Perhaps resorting to a default hw hash for team is acceptable.
>>John
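
For illustration, the driver-side check being discussed might look like
the sketch below, assuming the hash_type field that patch 5 adds to
struct netdev_lag_upper_info, with NETDEV_LAG_HASH_UNKNOWN as the
proposed escape hatch for team's arbitrary hashing:

/* Sketch only: offload L3/L4 hashing, let hardware use its default
 * for NETDEV_LAG_HASH_UNKNOWN, refuse anything fw cannot reproduce. */
static bool lag_hash_offloadable(struct netdev_lag_upper_info *info)
{
	if (info->tx_type != NETDEV_LAG_TX_TYPE_HASH)
		return false;

	switch (info->hash_type) {
	case NETDEV_LAG_HASH_L34:	/* matches the firmware hash */
	case NETDEV_LAG_HASH_UNKNOWN:	/* fall back to a hw default */
		return true;
	default:
		return false;
	}
}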


Re: [PATCH v4 net-next 00/19] inet: frags: bring rhashtables to IP defrag

2018-05-30 Thread Jesper Dangaard Brouer
On Mon, 28 May 2018 09:09:17 -0700
Eric Dumazet  wrote:

> Tariq, here are my test results : No drops for me.
> 
> # ./netperf -H 2607:f8b0:8099:e18:: -t UDP_STREAM
> MIGRATED UDP STREAM TEST from ::0 (::) port 0 AF_INET6 to 
> 2607:f8b0:8099:e18:: () port 0 AF_INET6
> Socket  Message  Elapsed      Messages
> Size    Size     Time         Okay Errors   Throughput
> bytes   bytes    secs            #      #   10^6bits/sec
> 
> 212992   65507   10.00      202117      0    10592.00
> 212992           10.00           0               0.00

Hmm... Eric, the above results show that ALL your UDP packets were dropped!
You have 0 okay messages and 0.00 Mbit/s throughput.

It needs to look like below (test on i40e NIC):

$ netperf -t UDP_STREAM -H fee0:cafe::1
MIGRATED UDP STREAM TEST from ::0 (::) port 0 AF_INET6 to fee0:cafe::1 () port 
0 AF_INET6 : histogram : demo
Socket  Message  Elapsed      Messages
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

212992   65507   10.00      186385      0     9767.08
212992           10.00      186385            9767.08


If I manually instruct ip6tables to drop all UDP packets, then I get
what you see... so something on your test system is likely dropping
your UDP packets, while letting the regular netperf (TCP) control
communication through.

# ip6tables -I INPUT -p udp -j DROP

$ netperf -t UDP_STREAM -H fee0:cafe::1
MIGRATED UDP STREAM TEST from ::0 (::) port 0 AF_INET6 to fee0:cafe::1 () port 
0 AF_INET6 : histogram : demo
Socket  Message  Elapsed      Messages
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

212992   65507   10.00      182095      0     9542.41
212992           10.00           0               0.00


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: [PATCH net] VSOCK: check sk state before receive

2018-05-30 Thread Stefan Hajnoczi
On Sun, May 27, 2018 at 11:29:45PM +0800, Hangbin Liu wrote:
> Hmm... although I can't reproduce this bug with my reproducer after
> applying my patch, I can still get a similar issue with the syzkaller
> sock vnet test.
> 
> It looks like this patch is not complete. Here is the KASAN call trace
> with my patch applied.
> I can also reproduce it without my patch.

Seems like a race between vmci_datagram_destroy_handle() and the
delayed callback, vmci_transport_recv_dgram_cb().

I don't know the VMCI transport well so I'll leave this to Jorgen.

> ==
> BUG: KASAN: use-after-free in vmci_transport_allow_dgram.part.7+0x155/0x1a0 
> [vmw_vsock_vmci_transport]
> Read of size 4 at addr 880026a3a914 by task kworker/0:2/96
> 
> CPU: 0 PID: 96 Comm: kworker/0:2 Not tainted 4.17.0-rc6.vsock+ #28
> Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> Workqueue: events dg_delayed_dispatch [vmw_vmci]
> Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0xdd/0x18e lib/dump_stack.c:113
>  print_address_description+0x7a/0x3e0 mm/kasan/report.c:256
>  kasan_report_error mm/kasan/report.c:354 [inline]
>  kasan_report+0x1dd/0x460 mm/kasan/report.c:412
>  vmci_transport_allow_dgram.part.7+0x155/0x1a0 [vmw_vsock_vmci_transport]
>  vmci_transport_recv_dgram_cb+0x5d/0x200 [vmw_vsock_vmci_transport]
>  dg_delayed_dispatch+0x99/0x1b0 [vmw_vmci]
>  process_one_work+0xa4e/0x1720 kernel/workqueue.c:2145
>  worker_thread+0x1df/0x1400 kernel/workqueue.c:2279
>  kthread+0x343/0x4b0 kernel/kthread.c:240
>  ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:412
> 
> Allocated by task 2684:
>  set_track mm/kasan/kasan.c:460 [inline]
>  kasan_kmalloc+0xa0/0xd0 mm/kasan/kasan.c:553
>  slab_post_alloc_hook mm/slab.h:444 [inline]
>  slab_alloc_node mm/slub.c:2741 [inline]
>  slab_alloc mm/slub.c:2749 [inline]
>  kmem_cache_alloc+0x105/0x330 mm/slub.c:2754
>  sk_prot_alloc+0x6a/0x2c0 net/core/sock.c:1468
>  sk_alloc+0xc9/0xbb0 net/core/sock.c:1528
>  __vsock_create+0xc8/0x9b0 [vsock]
>  vsock_create+0xfd/0x1a0 [vsock]
>  __sock_create+0x310/0x690 net/socket.c:1285
>  sock_create net/socket.c:1325 [inline]
>  __sys_socket+0x101/0x240 net/socket.c:1355
>  __do_sys_socket net/socket.c:1364 [inline]
>  __se_sys_socket net/socket.c:1362 [inline]
>  __x64_sys_socket+0x7d/0xd0 net/socket.c:1362
>  do_syscall_64+0x175/0x630 arch/x86/entry/common.c:287
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Freed by task 2684:
>  set_track mm/kasan/kasan.c:460 [inline]
>  __kasan_slab_free+0x130/0x180 mm/kasan/kasan.c:521
>  slab_free_hook mm/slub.c:1388 [inline]
>  slab_free_freelist_hook mm/slub.c:1415 [inline]
>  slab_free mm/slub.c:2988 [inline]
>  kmem_cache_free+0xce/0x410 mm/slub.c:3004
>  sk_prot_free net/core/sock.c:1509 [inline]
>  __sk_destruct+0x629/0x940 net/core/sock.c:1593
>  sk_destruct+0x4e/0x90 net/core/sock.c:1601
>  __sk_free+0xd3/0x320 net/core/sock.c:1612
>  sk_free+0x2a/0x30 net/core/sock.c:1623
>  __vsock_release+0x431/0x610 [vsock]
>  vsock_release+0x3c/0xc0 [vsock]
>  sock_release+0x91/0x200 net/socket.c:594
>  sock_close+0x17/0x20 net/socket.c:1149
>  __fput+0x368/0xa20 fs/file_table.c:209
>  task_work_run+0x1c5/0x2a0 kernel/task_work.c:113
>  exit_task_work include/linux/task_work.h:22 [inline]
>  do_exit+0x1876/0x26c0 kernel/exit.c:865
>  do_group_exit+0x159/0x3e0 kernel/exit.c:968
>  get_signal+0x65a/0x1780 kernel/signal.c:2482
>  do_signal+0xa4/0x1fe0 arch/x86/kernel/signal.c:810
>  exit_to_usermode_loop+0x1b8/0x260 arch/x86/entry/common.c:162
>  prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
>  syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
>  do_syscall_64+0x505/0x630 arch/x86/entry/common.c:290
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> The buggy address belongs to the object at 880026a3a600
>  which belongs to the cache AF_VSOCK of size 1056
> The buggy address is located 788 bytes inside of
>  1056-byte region [880026a3a600, 880026a3aa20)
> The buggy address belongs to the page:
> page:ea9a8e00 count:1 mapcount:0 mapping: index:0x0 
> compound_mapcount: 0
> flags: 0xfc0008100(slab|head)
> raw: 000fc0008100   0001000d000d
> raw: dead0100 dead0200 880034471a40 
> page dumped because: kasan: bad access detected
> 
> Memory state around the buggy address:
>  880026a3a800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>  880026a3a880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> >880026a3a900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>  ^
>  880026a3a980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>  880026a3aa00: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
> ==


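The usual cure for this class of race is to pin the sock across the
deferred callback. A generic sketch, not Jorgen's eventual fix; struct
dg_work is hypothetical:

struct dg_work {
	struct work_struct work;
	struct sock *sk;
};

static void dg_work_fn(struct work_struct *work)
{
	struct dg_work *dw = container_of(work, struct dg_work, work);

	/* ... deliver the datagram to dw->sk ... */
	sock_put(dw->sk);	/* drop the ref taken at schedule time */
	kfree(dw);
}

static void queue_dgram(struct sock *sk, struct dg_work *dw)
{
	sock_hold(sk);		/* keep sk alive until the callback runs */
	dw->sk = sk;
	INIT_WORK(&dw->work, dg_work_fn);
	schedule_work(&dw->work);
}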


[PATCH net 1/2] ip_tunnel: restore binding to ifaces with a large mtu

2018-05-30 Thread Nicolas Dichtel
After commit f6cc9c054e77, the following configuration is broken (note
that the default loopback mtu is 65536, i.e. IP_MAX_MTU + 1):

$ ip tunnel add gre1 mode gre local 10.125.0.1 remote 10.125.0.2 dev lo
add tunnel "gre0" failed: Invalid argument
$ ip l a type dummy
$ ip l s dummy1 up
$ ip l s dummy1 mtu 65535
$ ip tunnel add gre1 mode gre local 10.125.0.1 remote 10.125.0.2 dev dummy1
add tunnel "gre0" failed: Invalid argument

dev_set_mtu() doesn't allow setting an mtu which is too large.
First, let's cap the mtu returned by ip_tunnel_bind_dev(). Second, remove
the magic value 0xFFF8 and use IP_MAX_MTU instead.
0xFFF8 seems to have been there for ages; I don't know why this value was used.

With a recent kernel, it's also possible to set an mtu > IP_MAX_MTU:
$ ip l s dummy1 mtu 66000
After this patch, it's also possible to bind an ip tunnel on that kind of
interface.

CC: Petr Machata 
CC: Ido Schimmel 
Link: https://git.kernel.org/pub/scm/linux/kernel/git/davem/netdev-vger-cvs.git/commit/?id=e5afd356a411a
Fixes: f6cc9c054e77 ("ip_tunnel: Emit events for post-register MTU changes")
Signed-off-by: Nicolas Dichtel 
---
 net/ipv4/ip_tunnel.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 6b0e362cc99b..3b39c72a1029 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -328,7 +328,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 
if (tdev) {
hlen = tdev->hard_header_len + tdev->needed_headroom;
-   mtu = tdev->mtu;
+   mtu = min(tdev->mtu, IP_MAX_MTU);
}
 
dev->needed_headroom = t_hlen + hlen;
@@ -362,7 +362,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
nt = netdev_priv(dev);
t_hlen = nt->hlen + sizeof(struct iphdr);
dev->min_mtu = ETH_MIN_MTU;
-   dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
+   dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
ip_tunnel_add(itn, nt);
return nt;
 
@@ -930,7 +930,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 {
struct ip_tunnel *tunnel = netdev_priv(dev);
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
-   int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
+   int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
 
if (new_mtu < ETH_MIN_MTU)
return -EINVAL;
-- 
2.15.1


