[net-next v2 1/2] bpf, seccomp: Add eBPF filter capabilities

2018-02-16 Thread Sargun Dhillon
This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
to be used for seccomp filters as an alternative to cBPF filters. The
program type has relatively limited capabilities in terms of helpers,
but that can be extended later on.

The eBPF code loading is separated from attachment of the filter, so
a privileged user can load the filter, and pass it back to an
unprivileged user who can attach it and use it at a later time.

In order to attach the filter itself, you need to supply a flag to the
seccomp syscall indicating that an eBPF filter is being attached, as
opposed to a cBPF one. Verification occurs at program load time,
so the user should only receive errors related to attachment.

Signed-off-by: Sargun Dhillon 
---
 arch/Kconfig |   8 +++
 include/linux/bpf_types.h|   3 +
 include/linux/seccomp.h  |   3 +-
 include/uapi/linux/bpf.h |   2 +
 include/uapi/linux/seccomp.h |   7 ++-
 kernel/bpf/syscall.c |   1 +
 kernel/seccomp.c | 145 +--
 7 files changed, 147 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 76c0b54443b1..8490d35e59d6 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -401,6 +401,14 @@ config SECCOMP_FILTER
 
  See Documentation/prctl/seccomp_filter.txt for details.
 
+config SECCOMP_FILTER_EXTENDED
+   bool "Extended BPF seccomp filters"
+   depends on SECCOMP_FILTER && BPF_SYSCALL
+   depends on !CHECKPOINT_RESTORE
+   help
+ Enables seccomp filters to be written in eBPF, as opposed
+ to just cBPF filters.
+
 config HAVE_GCC_PLUGINS
bool
help
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 19b8349a3809..945c65c4e461 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #endif
+#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
+BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index c723a5c4e3ff..a7df3ba6cf25 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -5,7 +5,8 @@
 #include 
 
 #define SECCOMP_FILTER_FLAG_MASK   (SECCOMP_FILTER_FLAG_TSYNC | \
-SECCOMP_FILTER_FLAG_LOG)
+SECCOMP_FILTER_FLAG_LOG | \
+SECCOMP_FILTER_FLAG_EXTENDED)
 
 #ifdef CONFIG_SECCOMP
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index db6bdc375126..5f96cb7ed954 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1,3 +1,4 @@
+
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  *
@@ -133,6 +134,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SOCK_OPS,
BPF_PROG_TYPE_SK_SKB,
BPF_PROG_TYPE_CGROUP_DEVICE,
+   BPF_PROG_TYPE_SECCOMP,
 };
 
 enum bpf_attach_type {
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 2a0bd9dd104d..730af6c7ec2e 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -16,10 +16,11 @@
 #define SECCOMP_SET_MODE_FILTER1
 #define SECCOMP_GET_ACTION_AVAIL   2
 
-/* Valid flags for SECCOMP_SET_MODE_FILTER */
-#define SECCOMP_FILTER_FLAG_TSYNC  1
-#define SECCOMP_FILTER_FLAG_LOG2
 
+/* Valid flags for SECCOMP_SET_MODE_FILTER */
+#define SECCOMP_FILTER_FLAG_TSYNC  (1 << 0)
+#define SECCOMP_FILTER_FLAG_LOG(1 << 1)
+#define SECCOMP_FILTER_FLAG_EXTENDED   (1 << 2)
 /*
  * All BPF programs must return a 32-bit value.
  * The bottom 16-bits are for optional return data.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e24aa3241387..86d6ec8b916d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
+   type != BPF_PROG_TYPE_SECCOMP &&
!capable(CAP_SYS_ADMIN))
return -EPERM;
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 940fa408a288..f8ddc4af1135 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /**
  * struct seccomp_filter - container for seccomp BPF programs
@@ -367,17 +368,6 @@ static struct seccomp_filter 
*seccomp_prepare_filter(struct sock_fprog *fprog)
 
BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
 
-   /*
-* Installing a seccomp filter requires that the task has
-* CAP_SYS_ADMIN in its namespace or be running with 

[net-next v2 2/2] bpf: Add eBPF seccomp sample programs

2018-02-16 Thread Sargun Dhillon
This adds a sample program that uses seccomp-eBPF, called
seccomp1. It shows the simple ability to code seccomp filters
in C.

Signed-off-by: Sargun Dhillon 
---
 samples/bpf/Makefile|  5 +
 samples/bpf/bpf_load.c  |  9 +++--
 samples/bpf/seccomp1_kern.c | 43 +++
 samples/bpf/seccomp1_user.c | 45 +
 4 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/seccomp1_kern.c
 create mode 100644 samples/bpf/seccomp1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ec3fc8d88e87..264838846f71 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu
 hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
+hostprogs-y += seccomp1
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -93,6 +94,8 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) 
xdp_redirect_cpu_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
+seccomp1-objs := bpf_load.o $(LIBBPF) seccomp1_user.o
+
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -144,6 +147,7 @@ always += xdp_monitor_kern.o
 always += xdp_rxq_info_kern.o
 always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
+always += seccomp1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -188,6 +192,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf
 HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
+HOSTLOADLIBES_seccomp1 += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on 
cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc 
CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 69806d74fa53..856bc8b93916 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -67,6 +67,7 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
bool is_sockops = strncmp(event, "sockops", 7) == 0;
bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
+   bool is_seccomp = strncmp(event, "seccomp", 7) == 0;
size_t insns_cnt = size / sizeof(struct bpf_insn);
enum bpf_prog_type prog_type;
char buf[256];
@@ -96,6 +97,8 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
prog_type = BPF_PROG_TYPE_SOCK_OPS;
} else if (is_sk_skb) {
prog_type = BPF_PROG_TYPE_SK_SKB;
+   } else if (is_seccomp) {
+   prog_type = BPF_PROG_TYPE_SECCOMP;
} else {
printf("Unknown event '%s'\n", event);
return -1;
@@ -110,7 +113,8 @@ static int load_and_attach(const char *event, struct 
bpf_insn *prog, int size)
 
prog_fd[prog_cnt++] = fd;
 
-   if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
+   if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk ||
+   is_seccomp)
return 0;
 
if (is_socket || is_sockops || is_sk_skb) {
@@ -589,7 +593,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb 
fixup_map)
memcmp(shname, "socket", 6) == 0 ||
memcmp(shname, "cgroup/", 7) == 0 ||
memcmp(shname, "sockops", 7) == 0 ||
-   memcmp(shname, "sk_skb", 6) == 0) {
+   memcmp(shname, "sk_skb", 6) == 0 ||
+   memcmp(shname, "seccomp", 7) == 0) {
ret = load_and_attach(shname, data->d_buf,
  data->d_size);
if (ret != 0)
diff --git a/samples/bpf/seccomp1_kern.c b/samples/bpf/seccomp1_kern.c
new file mode 100644
index ..420e37eebd92
--- /dev/null
+++ b/samples/bpf/seccomp1_kern.c
@@ -0,0 +1,43 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include 
+#include 
+
+#if defined(__x86_64__)
+#define ARCH   AUDIT_ARCH_X86_64
+#elif defined(__i386__)
+#define ARCH   AUDIT_ARCH_I386
+#else
+#endif
+
+#ifdef ARCH
+/* Returns EPERM when trying to close fd 999 */
+SEC("seccomp")
+int bpf_prog1(struct seccomp_data *ctx)
+{
+   /*
+* Make sure this BPF program is being run on the same architecture it
+* was compiled on.
+*/
+   if (ctx->arch != ARCH)
+   return SECCOMP_RET_ERRNO | EPERM;
+   if (ctx->nr == __NR_close && ctx->args[0] == 999)
+   return SECCOMP_RET_ERRNO | EPERM;
+
+   return SECCOMP_RET_ALLOW;
+}
+#else
+#warning Architecture not supported -- Blocking all syscalls
+SEC("seccomp")
+int 

[net-next v2 0/2] eBPF Seccomp filters

2018-02-16 Thread Sargun Dhillon
This patchset enables seccomp filters to be written in eBPF. Although, this
patchset doesn't introduce much of the functionality enabled by eBPF, it lays
the ground work for it. Currently, you have to disable CHECKPOINT_RESTORE
support in order to utilize eBPF seccomp filters, as eBPF filters cannot be
retrieved via the ptrace GET_FILTER API.

Any user can load a bpf seccomp filter program, and it can be pinned and
reused without requiring access to the bpf syscalls. A user only requires
the traditional permissions of either being cap_sys_admin, or have
no_new_privs set in order to install their rule.

The primary reason for not adding maps support in this patchset is
to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS.
If we have a map that the BPF program can read, it can potentially
"change" privileges after running. It seems like doing writes only
is safe, because it can be pure, and side effect free, and therefore
not negatively affect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come
to an agreement, this can be in a follow-up patchset.

A benchmark of this patchset is as follows for a very standard eBPF filter:

Given this test program:
for (i = 10; i < ; i++) syscall(__NR_getpid);

If I implement an eBPF filter with PROG_ARRAYs with a program per syscall,
and tail call, the numbers are such:
ebpf JIT 12.3% slower than native
ebpf no JIT 13.6% slower than native
seccomp JIT 17.6% slower than native
seccomp no JIT 37% slower than native

The speed of the traditional seccomp filter increases O(n) with the number
of syscalls with discrete rulesets, whereas ebpf is O(1), given any number
of syscall filters.

Changes since v1:
  * Use a flag to indicate loading an eBPF filter, not a separate command
  * Remove printk helper
  * Remove ptrace patch / restore filter / sample
  * Add some safe helpers

Sargun Dhillon (2):
  bpf, seccomp: Add eBPF filter capabilities
  bpf: Add eBPF seccomp sample programs

 arch/Kconfig |   8 +++
 include/linux/bpf_types.h|   3 +
 include/linux/seccomp.h  |   3 +-
 include/uapi/linux/bpf.h |   2 +
 include/uapi/linux/seccomp.h |   7 ++-
 kernel/bpf/syscall.c |   1 +
 kernel/seccomp.c | 145 +--
 samples/bpf/Makefile |   5 ++
 samples/bpf/bpf_load.c   |   9 ++-
 samples/bpf/seccomp1_kern.c  |  43 +
 samples/bpf/seccomp1_user.c  |  45 ++
 11 files changed, 247 insertions(+), 24 deletions(-)
 create mode 100644 samples/bpf/seccomp1_kern.c
 create mode 100644 samples/bpf/seccomp1_user.c

-- 
2.14.1



[PATCH net] xfrm: do not call rcu_read_unlock when afinfo is NULL in xfrm_get_tos

2018-02-16 Thread Xin Long
When xfrm_policy_get_afinfo returns NULL, it will not hold rcu
read lock. In this case, rcu_read_unlock should not be called
in xfrm_get_tos, just like other places where it's calling
xfrm_policy_get_afinfo.

Fixes: f5e2bb4f5b22 ("xfrm: policy: xfrm_get_tos cannot fail")
Signed-off-by: Xin Long 
---
 net/xfrm/xfrm_policy.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7a23078..dd4041f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1458,10 +1458,13 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, 
const struct flowi *fl,
 static int xfrm_get_tos(const struct flowi *fl, int family)
 {
const struct xfrm_policy_afinfo *afinfo;
-   int tos = 0;
+   int tos;
 
afinfo = xfrm_policy_get_afinfo(family);
-   tos = afinfo ? afinfo->get_tos(fl) : 0;
+   if (!afinfo)
+   return 0;
+
+   tos = afinfo->get_tos(fl);
 
rcu_read_unlock();
 
-- 
2.1.0



Re: [PATCH net-next 1/3] bpf, seccomp: Add eBPF filter capabilities

2018-02-16 Thread Sargun Dhillon
On Tue, Feb 13, 2018 at 12:34 PM, Kees Cook  wrote:
> On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon  wrote:
>> From: Sargun Dhillon 
>>
>> This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
>> to be used for seccomp filters as an alternative to cBPF filters. The
>> program type has relatively limited capabilities in terms of helpers,
>> but that can be extended later on.
>>
>> It also introduces a new mechanism to attach these filters via the
>> prctl and seccomp syscalls -- SECCOMP_MODE_FILTER_EXTENDED, and
>> SECCOMP_SET_MODE_FILTER_EXTENDED respectively.
>>
>> Signed-off-by: Sargun Dhillon 
>> ---
>>  arch/Kconfig |   7 ++
>>  include/linux/bpf_types.h|   3 +
>>  include/uapi/linux/bpf.h |   2 +
>>  include/uapi/linux/seccomp.h |  15 +++--
>>  kernel/bpf/syscall.c |   1 +
>>  kernel/seccomp.c | 148 
>> +--
>>  6 files changed, 150 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 76c0b54443b1..db675888577c 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -401,6 +401,13 @@ config SECCOMP_FILTER
>>
>>   See Documentation/prctl/seccomp_filter.txt for details.
>>
>> +config SECCOMP_FILTER_EXTENDED
>> +   bool "Extended BPF seccomp filters"
>> +   depends on SECCOMP_FILTER && BPF_SYSCALL
>> +   help
>> + Enables seccomp filters to be written in eBPF, as opposed
>> + to just cBPF filters.
>> +
>>  config HAVE_GCC_PLUGINS
>> bool
>> help
>> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
>> index 19b8349a3809..945c65c4e461 100644
>> --- a/include/linux/bpf_types.h
>> +++ b/include/linux/bpf_types.h
>> @@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
>>  #ifdef CONFIG_CGROUP_BPF
>>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
>>  #endif
>> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
>> +BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
>> +#endif
>>
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index db6bdc375126..5f96cb7ed954 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1,3 +1,4 @@
>> +
>>  /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
>>  /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
>>   *
>> @@ -133,6 +134,7 @@ enum bpf_prog_type {
>> BPF_PROG_TYPE_SOCK_OPS,
>> BPF_PROG_TYPE_SK_SKB,
>> BPF_PROG_TYPE_CGROUP_DEVICE,
>> +   BPF_PROG_TYPE_SECCOMP,
>>  };
>>
>>  enum bpf_attach_type {
>> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
>> index 2a0bd9dd104d..7da8b39f2a6a 100644
>> --- a/include/uapi/linux/seccomp.h
>> +++ b/include/uapi/linux/seccomp.h
>> @@ -7,14 +7,17 @@
>>
>>
>>  /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, ) */
>> -#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
>> -#define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
>> -#define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */
>> +#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
>> +#define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
>> +#define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */
>> +#define SECCOMP_MODE_FILTER_EXTENDED   3 /* uses eBPF filter from fd */
>
> This MODE flag isn't needed: it's still using a filter, and the
> interface changes should be sufficient with
> SECCOMP_SET_MODE_FILTER_EXTENDED below.
>
>>  /* Valid operations for seccomp syscall. */
>> -#define SECCOMP_SET_MODE_STRICT0
>> -#define SECCOMP_SET_MODE_FILTER1
>> -#define SECCOMP_GET_ACTION_AVAIL   2
>> +#define SECCOMP_SET_MODE_STRICT0
>> +#define SECCOMP_SET_MODE_FILTER1
>> +#define SECCOMP_GET_ACTION_AVAIL   2
>> +#define SECCOMP_SET_MODE_FILTER_EXTENDED   3
>
> It seems like this should be a FILTER flag, not a syscall op change?
>
>> +
>>
>>  /* Valid flags for SECCOMP_SET_MODE_FILTER */
>>  #define SECCOMP_FILTER_FLAG_TSYNC  1
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index e24aa3241387..86d6ec8b916d 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr)
>>
>> if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
>> type != BPF_PROG_TYPE_CGROUP_SKB &&
>> +   type != BPF_PROG_TYPE_SECCOMP &&
>> !capable(CAP_SYS_ADMIN))
>> return -EPERM;
>
> So only init_ns-CAP_SYS_ADMIN would be able to use seccomp eBPF?
>
No, this is specifically so non-init CAP_SYS_ADMIN can load BPF
filters that are either socket_filter, cgroup_skb, or seccomp.


Re: [PATCH V6 2/4] sctp: Add ip option support

2018-02-16 Thread Neil Horman
On Fri, Feb 16, 2018 at 07:51:02PM -0200, Marcelo Ricardo Leitner wrote:
> On Fri, Feb 16, 2018 at 03:14:35PM -0500, Neil Horman wrote:
> > On Fri, Feb 16, 2018 at 10:56:07AM -0200, Marcelo Ricardo Leitner wrote:
> > > On Thu, Feb 15, 2018 at 09:15:40AM -0500, Neil Horman wrote:
> > > > On Tue, Feb 13, 2018 at 08:54:44PM +, Richard Haines wrote:
> > > > > Add ip option support to allow LSM security modules to utilise 
> > > > > CIPSO/IPv4
> > > > > and CALIPSO/IPv6 services.
> > > > > 
> > > > > Signed-off-by: Richard Haines 
> > > > > ---
> > > > >  include/net/sctp/sctp.h|  4 +++-
> > > > >  include/net/sctp/structs.h |  2 ++
> > > > >  net/sctp/chunk.c   | 12 +++-
> > > > >  net/sctp/ipv6.c| 42 
> > > > > +++---
> > > > >  net/sctp/output.c  |  5 -
> > > > >  net/sctp/protocol.c| 36 
> > > > >  net/sctp/socket.c  | 14 ++
> > > > >  7 files changed, 97 insertions(+), 18 deletions(-)
> > > > > 
> > > > > diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> > > > > index f7ae6b0..25c5c87 100644
> > > > > --- a/include/net/sctp/sctp.h
> > > > > +++ b/include/net/sctp/sctp.h
> > > > > @@ -441,9 +441,11 @@ static inline int sctp_list_single_entry(struct 
> > > > > list_head *head)
> > > > >  static inline int sctp_frag_point(const struct sctp_association 
> > > > > *asoc, int pmtu)
> > > > >  {
> > > > >   struct sctp_sock *sp = sctp_sk(asoc->base.sk);
> > > > > + struct sctp_af *af = sp->pf->af;
> > > > >   int frag = pmtu;
> > > > >  
> > > > > - frag -= sp->pf->af->net_header_len;
> > > > > + frag -= af->ip_options_len(asoc->base.sk);
> > > > > + frag -= af->net_header_len;
> > > > >   frag -= sizeof(struct sctphdr) + 
> > > > > sctp_datachk_len(>stream);
> > > > >  
> > > > >   if (asoc->user_frag)
> > > > > diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> > > > > index 03e92dd..ead5fce 100644
> > > > > --- a/include/net/sctp/structs.h
> > > > > +++ b/include/net/sctp/structs.h
> > > > > @@ -491,6 +491,7 @@ struct sctp_af {
> > > > >   void(*ecn_capable)(struct sock *sk);
> > > > >   __u16   net_header_len;
> > > > >   int sockaddr_len;
> > > > > + int (*ip_options_len)(struct sock *sk);
> > > > >   sa_family_t sa_family;
> > > > >   struct list_head list;
> > > > >  };
> > > > > @@ -515,6 +516,7 @@ struct sctp_pf {
> > > > >   int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr 
> > > > > *addr);
> > > > >   void (*to_sk_saddr)(union sctp_addr *, struct sock *sk);
> > > > >   void (*to_sk_daddr)(union sctp_addr *, struct sock *sk);
> > > > > + void (*copy_ip_options)(struct sock *sk, struct sock *newsk);
> > > > >   struct sctp_af *af;
> > > > >  };
> > > > >  
> > > > > diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> > > > > index 991a530..d5c0ef7 100644
> > > > > --- a/net/sctp/chunk.c
> > > > > +++ b/net/sctp/chunk.c
> > > > > @@ -154,7 +154,6 @@ static void sctp_datamsg_assign(struct 
> > > > > sctp_datamsg *msg, struct sctp_chunk *chu
> > > > >   chunk->msg = msg;
> > > > >  }
> > > > >  
> > > > > -
> > > > >  /* A data chunk can have a maximum payload of (2^16 - 20).  Break
> > > > >   * down any such message into smaller chunks.  Opportunistically, 
> > > > > fragment
> > > > >   * the chunks down to the current MTU constraints.  We may get 
> > > > > refragmented
> > > > > @@ -171,6 +170,8 @@ struct sctp_datamsg 
> > > > > *sctp_datamsg_from_user(struct sctp_association *asoc,
> > > > >   struct list_head *pos, *temp;
> > > > >   struct sctp_chunk *chunk;
> > > > >   struct sctp_datamsg *msg;
> > > > > + struct sctp_sock *sp;
> > > > > + struct sctp_af *af;
> > > > >   int err;
> > > > >  
> > > > >   msg = sctp_datamsg_new(GFP_KERNEL);
> > > > > @@ -189,9 +190,11 @@ struct sctp_datamsg 
> > > > > *sctp_datamsg_from_user(struct sctp_association *asoc,
> > > > >   /* This is the biggest possible DATA chunk that can fit into
> > > > >* the packet
> > > > >*/
> > > > > - max_data = asoc->pathmtu -
> > > > > -sctp_sk(asoc->base.sk)->pf->af->net_header_len -
> > > > > -sizeof(struct sctphdr) - 
> > > > > sctp_datachk_len(>stream);
> > > > > + sp = sctp_sk(asoc->base.sk);
> > > > > + af = sp->pf->af;
> > > > > + max_data = asoc->pathmtu - af->net_header_len -
> > > > > +sizeof(struct sctphdr) - 
> > > > > sctp_datachk_len(>stream) -
> > > > > +af->ip_options_len(asoc->base.sk);
> > > > >   max_data = SCTP_TRUNC4(max_data);
> > > > >  
> > > > >   /* If the the peer requested that we authenticate DATA chunks
> > > > > @@ -211,7 +214,6 @@ struct sctp_datamsg 
> > > > > *sctp_datamsg_from_user(struct sctp_association *asoc,

[PATCH nf] netfilter: IDLETIMER: be syzkaller friendly

2018-02-16 Thread Eric Dumazet
From: Eric Dumazet 

We had one report from syzkaller [1]

First issue is that INIT_WORK() should be done before mod_timer()
or we risk timer being fired too soon, even with a 1 second timer.

Second issue is that we need to reject too big info->timeout
to avoid overflows in msecs_to_jiffies(info->timeout * 1000), or
risk looping, if result after overflow is 0.

[1]
WARNING: CPU: 1 PID: 5129 at kernel/workqueue.c:1444 __queue_work+0xdf4/0x1230 
kernel/workqueue.c:1444
Kernel panic - not syncing: panic_on_warn set ...

CPU: 1 PID: 5129 Comm: syzkaller159866 Not tainted 4.16.0-rc1+ #230
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Call Trace:
 
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:53
 panic+0x1e4/0x41c kernel/panic.c:183
 __warn+0x1dc/0x200 kernel/panic.c:547
 report_bug+0x211/0x2d0 lib/bug.c:184
 fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178
 fixup_bug arch/x86/kernel/traps.c:247 [inline]
 do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
 invalid_op+0x22/0x40 arch/x86/entry/entry_64.S:988
RIP: 0010:__queue_work+0xdf4/0x1230 kernel/workqueue.c:1444
RSP: 0018:8801db507538 EFLAGS: 00010006
RAX: 8801aeb46080 RBX: 8801db530200 RCX: 81481404
RDX: 0100 RSI: 86b42640 RDI: 0082
RBP: 8801db507758 R08: 11003b6a0de5 R09: 000c
R10: 8801db5073f0 R11: 0020 R12: 11003b6a0eb6
R13: 8801b1067ae0 R14: 01f8 R15: dc00
 queue_work_on+0x16a/0x1c0 kernel/workqueue.c:1488
 queue_work include/linux/workqueue.h:488 [inline]
 schedule_work include/linux/workqueue.h:546 [inline]
 idletimer_tg_expired+0x44/0x60 net/netfilter/xt_IDLETIMER.c:116
 call_timer_fn+0x228/0x820 kernel/time/timer.c:1326
 expire_timers kernel/time/timer.c:1363 [inline]
 __run_timers+0x7ee/0xb70 kernel/time/timer.c:1666
 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
 __do_softirq+0x2d7/0xb85 kernel/softirq.c:285
 invoke_softirq kernel/softirq.c:365 [inline]
 irq_exit+0x1cc/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:541 [inline]
 smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:829
 
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:777 [inline]
RIP: 0010:__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:160 
[inline]
RIP: 0010:_raw_spin_unlock_irqrestore+0x5e/0xba kernel/locking/spinlock.c:184
RSP: 0018:8801c20173c8 EFLAGS: 0282 ORIG_RAX: ff12
RAX: dc00 RBX: 0282 RCX: 0006
RDX: 10d592cd RSI: 110035d68d23 RDI: 0282
RBP: 8801c20173d8 R08: 110038402e47 R09: 
R10:  R11:  R12: 8820e5c8
R13: 8801b1067ad8 R14: 8801aea7c268 R15: 8801aea7c278
 __debug_object_init+0x235/0x1040 lib/debugobjects.c:378
 debug_object_init+0x17/0x20 lib/debugobjects.c:391
 __init_work+0x2b/0x60 kernel/workqueue.c:506
 idletimer_tg_create net/netfilter/xt_IDLETIMER.c:152 [inline]
 idletimer_tg_checkentry+0x691/0xb00 net/netfilter/xt_IDLETIMER.c:213
 xt_check_target+0x22c/0x7d0 net/netfilter/x_tables.c:850
 check_target net/ipv6/netfilter/ip6_tables.c:533 [inline]
 find_check_entry.isra.7+0x935/0xcf0 net/ipv6/netfilter/ip6_tables.c:575
 translate_table+0xf52/0x1690 net/ipv6/netfilter/ip6_tables.c:744
 do_replace net/ipv6/netfilter/ip6_tables.c:1160 [inline]
 do_ip6t_set_ctl+0x370/0x5f0 net/ipv6/netfilter/ip6_tables.c:1686
 nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
 nf_setsockopt+0x67/0xc0 net/netfilter/nf_sockopt.c:115
 ipv6_setsockopt+0x10b/0x130 net/ipv6/ipv6_sockglue.c:927
 udpv6_setsockopt+0x45/0x80 net/ipv6/udp.c:1422
 sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2976
 SYSC_setsockopt net/socket.c:1850 [inline]
 SyS_setsockopt+0x189/0x360 net/socket.c:1829
 do_syscall_64+0x282/0x940 arch/x86/entry/common.c:287

Fixes: 0902b469bd25 ("netfilter: xtables: idletimer target implementation")
Signed-off-by: Eric Dumazet 
Reported-by: syzkaller 
---
 net/netfilter/xt_IDLETIMER.c |9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 
6c2482b709b1eca341926a8393f45dfead358561..1ac6600bfafd60b6b4d5aaf88a41fb57c6ec195b
 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -146,11 +146,11 @@ static int idletimer_tg_create(struct idletimer_tg_info 
*info)
timer_setup(>timer->timer, idletimer_tg_expired, 0);
info->timer->refcnt = 1;
 
+   INIT_WORK(>timer->work, idletimer_tg_work);
+
mod_timer(>timer->timer,
  msecs_to_jiffies(info->timeout * 1000) + jiffies);
 
-   INIT_WORK(>timer->work, 

Re: [RFC PATCH v3 2/3] virtio_net: Extend virtio to use VF datapath when available

2018-02-16 Thread Jakub Kicinski
On Fri, 16 Feb 2018 10:11:21 -0800, Sridhar Samudrala wrote:
> This patch enables virtio_net to switch over to a VF datapath when a VF
> netdev is present with the same MAC address. It allows live migration
> of a VM with a direct attached VF without the need to setup a bond/team
> between a VF and virtio net device in the guest.
> 
> The hypervisor needs to enable only one datapath at any time so that
> packets don't get looped back to the VM over the other datapath. When a VF
> is plugged, the virtio datapath link state can be marked as down. The
> hypervisor needs to unplug the VF device from the guest on the source host
> and reset the MAC filter of the VF to initiate failover of datapath to
> virtio before starting the migration. After the migration is completed,
> the destination hypervisor sets the MAC filter on the VF and plugs it back
> to the guest to switch over to VF datapath.
> 
> When BACKUP feature is enabled, an additional netdev(bypass netdev) is
> created that acts as a master device and tracks the state of the 2 lower
> netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
> passthru device with the same MAC is registered as 'active' netdev.
> 
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
> 
> Signed-off-by: Sridhar Samudrala 
> Signed-off-by: Alexander Duyck  

> +static void
> +virtnet_bypass_get_stats(struct net_device *dev,
> +  struct rtnl_link_stats64 *stats)
> +{
> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
> + const struct rtnl_link_stats64 *new;
> + struct rtnl_link_stats64 temp;
> + struct net_device *child_netdev;
> +
> + spin_lock(>stats_lock);
> + memcpy(stats, >bypass_stats, sizeof(*stats));
> +
> + rcu_read_lock();
> +
> + child_netdev = rcu_dereference(vbi->active_netdev);
> + if (child_netdev) {
> + new = dev_get_stats(child_netdev, );
> + virtnet_bypass_fold_stats(stats, new, >active_stats);
> + memcpy(>active_stats, new, sizeof(*new));
> + }
> +
> + child_netdev = rcu_dereference(vbi->backup_netdev);
> + if (child_netdev) {
> + new = dev_get_stats(child_netdev, );
> + virtnet_bypass_fold_stats(stats, new, >backup_stats);
> + memcpy(>backup_stats, new, sizeof(*new));
> + }
> +
> + rcu_read_unlock();
> +
> + memcpy(>bypass_stats, stats, sizeof(*stats));
> + spin_unlock(>stats_lock);
> +}
> +
> +static int virtnet_bypass_change_mtu(struct net_device *dev, int new_mtu)
> +{
> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
> + struct net_device *child_netdev;
> + int ret = 0;
> +
> + child_netdev = rcu_dereference(vbi->active_netdev);
> + if (child_netdev) {
> + ret = dev_set_mtu(child_netdev, new_mtu);
> + if (ret)
> + return ret;
> + }
> +
> + child_netdev = rcu_dereference(vbi->backup_netdev);
> + if (child_netdev) {
> + ret = dev_set_mtu(child_netdev, new_mtu);
> + if (ret)
> + netdev_err(child_netdev,
> +"Unexpected failure to set mtu to %d\n",
> +new_mtu);

You should probably unwind if set fails on one of the legs.

> + }
> +
> + dev->mtu = new_mtu;
> + return 0;
> +}

nit: stats, mtu, all those mundane things are implemented in team
 already.  If we had this as kernel-internal team mode we wouldn't
 have to reimplement them...  You probably did investigate that
 option, for my edification, would you mind saying what the
 challenges/downsides were?

> +static struct net_device *
> +get_virtnet_bypass_bymac(struct net *net, const u8 *mac)
> +{
> + struct net_device *dev;
> +
> + ASSERT_RTNL();
> +
> + for_each_netdev(net, dev) {
> + if (dev->netdev_ops != _bypass_netdev_ops)
> + continue;   /* not a virtnet_bypass device */

Is there anything inherently wrong with enslaving another virtio dev
now?  I was expecting something like a hash map to map MAC addr ->
master and then one can check if dev is already enslaved to that master.
Just a random thought, I'm probably missing something...

> + if (ether_addr_equal(mac, dev->perm_addr))
> + return dev;
> + }
> +
> + return NULL;
> +}
> +
> +static struct net_device *
> +get_virtnet_bypass_byref(struct net_device *child_netdev)
> +{
> + struct net *net = dev_net(child_netdev);
> + struct net_device *dev;
> +
> + ASSERT_RTNL();
> +
> + for_each_netdev(net, dev) {
> + struct virtnet_bypass_info *vbi;
> +
> + if (dev->netdev_ops != _bypass_netdev_ops)
> + continue;   /* not a virtnet_bypass device */
> +
> + vbi = 

Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device

2018-02-16 Thread Jakub Kicinski
On Fri, 16 Feb 2018 10:11:19 -0800, Sridhar Samudrala wrote:
> Ppatch 2 is in response to the community request for a 3 netdev
> solution.  However, it creates some issues we'll get into in a moment.
> It extends virtio_net to use alternate datapath when available and
> registered. When BACKUP feature is enabled, virtio_net driver creates
> an additional 'bypass' netdev that acts as a master device and controls
> 2 slave devices.  The original virtio_net netdev is registered as
> 'backup' netdev and a passthru/vf device with the same MAC gets
> registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
> associated with the same 'pci' device.  The user accesses the network
> interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
> as default for transmits when it is available with link up and running.

Thank you for doing this.

> We noticed a couple of issues with this approach during testing.
> - As both 'bypass' and 'backup' netdevs are associated with the same
>   virtio pci device, udev tries to rename both of them with the same name
>   and the 2nd rename will fail. This would be OK as long as the first netdev
>   to be renamed is the 'bypass' netdev, but the order in which udev gets
>   to rename the 2 netdevs is not reliable. 

Out of curiosity - why do you link the master netdev to the virtio
struct device?

FWIW two solutions that immediately come to mind is to export "backup"
as phys_port_name of the backup virtio link and/or assign a name to the
master like you are doing already.  I think team uses team%d and bond
uses bond%d, soft naming of master devices seems quite natural in this
case.

IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio
link is quite neat.

> - When the 'active' netdev is unplugged OR not present on a destination
>   system after live migration, the user will see 2 virtio_net netdevs.

That's necessary and expected, all configuration applies to the master
so master must exist.


[patch net] mlxsw: spectrum_router: Do not unconditionally clear route offload indication

2018-02-16 Thread Jiri Pirko
From: Ido Schimmel 

When mlxsw replaces (or deletes) a route it removes the offload
indication from the replaced route. This is problematic for IPv4 routes,
as the offload indication is stored in the fib_info which is usually
shared between multiple routes.

Instead of unconditionally clearing the offload indication, only clear
it if no other route is using the fib_info.

Fixes: 3984d1a89fe7 ("mlxsw: spectrum_router: Provide offload indication using 
nexthop flags")
Signed-off-by: Ido Schimmel 
Reported-by: Alexander Petrovskiy 
Tested-by: Alexander Petrovskiy 
Signed-off-by: Jiri Pirko 
---
Dave, please, push to stable. Thanks!
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index dcc6305f7c22..f7948e983637 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -3794,6 +3794,9 @@ mlxsw_sp_fib4_entry_offload_unset(struct 
mlxsw_sp_fib_entry *fib_entry)
struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group;
int i;
 
+   if (!list_is_singular(_grp->fib_list))
+   return;
+
for (i = 0; i < nh_grp->count; i++) {
struct mlxsw_sp_nexthop *nh = _grp->nexthops[i];
 
-- 
2.14.3



Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Oleksandr Natalenko
On pátek 16. února 2018 23:50:35 CET Eric Dumazet wrote:
> /* snip */
> If you use
> 
> tcptrace -R test_s2c.pcap
> xplot.org d2c_rtt.xpl
> 
> Then you'll see plenty of suspect 40ms rtt samples.

That's odd. Especially how uniform they all look.

> It looks like receiver misses wakeups for some reason,
> and only the TCP delayed ACK timer is helping.
> 
> So it does not look like a sender side issue to me.

To make things even more complicated, I've disabled sg on the server, leaving 
it enabled on the client:

client to server flow: 935 Mbits/sec
server to client flow: 72.5 Mbits/sec

So still, to me it looks like a sender issue. No?




Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Eric Dumazet
On Fri, Feb 16, 2018 at 2:50 PM, Oleksandr Natalenko
 wrote:
> Hi.
>
> On pátek 16. února 2018 21:54:05 CET Eric Dumazet wrote:
>> /* snip */
>> Something fishy really :
>> /* snip */
>> Not only the receiver suddenly adds a 25 ms delay, but also note that
>> it acknowledges all prior segments (ack 112949), but with a wrong ecr
>> value ( 2327043753 )
>> instead of 2327043759
>> /* snip */
>
> Eric has encouraged me to look closer at what's there in the ethtool, and I've
> just had a free time to play with it. I've found out that enabling scatter-
> gather (ethtool -K enp3s0 sg on, it is disabled by default on both hosts)
> brings the throughput back to normal even with BBR+fq_codel.
>
> Why? What's the deal BBR has with sg?

Well, no effect  here on e1000e (1 Gbit) at least

# ethtool -K eth3 sg off
Actual changes:
scatter-gather: off
tx-scatter-gather: off
tcp-segmentation-offload: off
tx-tcp-segmentation: off [requested on]
tx-tcp6-segmentation: off [requested on]
generic-segmentation-offload: off [requested on]

# tc qd replace dev eth3 root pfifo_fast
# ./super_netperf 1 -H 7.7.7.84 -- -K cubic
941
# ./super_netperf 1 -H 7.7.7.84 -- -K bbr
941
# tc qd replace dev eth3 root fq
# ./super_netperf 1 -H 7.7.7.84 -- -K cubic
941
# ./super_netperf 1 -H 7.7.7.84 -- -K bbr
941
# tc qd replace dev eth3 root fq_codel
# ./super_netperf 1 -H 7.7.7.84 -- -K cubic
941
# ./super_netperf 1 -H 7.7.7.84 -- -K bbr
941
#


[PATCH net 3/3] net: qualcomm: rmnet: Fix possible null dereference in command processing

2018-02-16 Thread Subash Abhinov Kasiviswanathan
If a command packet with invalid mux id is received, the packet would
not have a valid endpoint. This invalid endpoint maybe dereferenced
leading to a crash. Identified by manual code inspection.

Fixes: 3352e6c45760 ("net: qualcomm: rmnet: Convert the muxed endpoint to 
hlist")
Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
index 6bc328f..b0dbca0 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
@@ -38,6 +38,11 @@ static u8 rmnet_map_do_flow_control(struct sk_buff *skb,
}
 
ep = rmnet_get_endpoint(port, mux_id);
+   if (!ep) {
+   kfree_skb(skb);
+   return RX_HANDLER_CONSUMED;
+   }
+
vnd = ep->egress_dev;
 
ip_family = cmd->flow_control.ip_family;
-- 
1.9.1



[PATCH net 1/3] net: qualcomm: rmnet: Fix crash on real dev unregistration

2018-02-16 Thread Subash Abhinov Kasiviswanathan
With CONFIG_DEBUG_PREEMPT enabled, a crash with the following call
stack was observed when removing a real dev which had rmnet devices
attached to it.
To fix this, remove the netdev_upper link APIs and instead use the
existing information in rmnet_port and rmnet_priv to get the
association between real and rmnet devs.

BUG: sleeping function called from invalid context
in_atomic(): 0, irqs_disabled(): 0, pid: 5762, name: ip
Preemption disabled at:
[] debug_object_active_state+0xa4/0x16c
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
Modules linked in:
PC is at ___might_sleep+0x13c/0x180
LR is at ___might_sleep+0x17c/0x180
[] ___might_sleep+0x13c/0x180
[] __might_sleep+0x58/0x8c
[] mutex_lock+0x2c/0x48
[] kernfs_remove_by_name_ns+0x48/0xa8
[] sysfs_remove_link+0x30/0x58
[] __netdev_adjacent_dev_remove+0x14c/0x1e0
[] __netdev_adjacent_dev_unlink_lists+0x40/0x68
[] netdev_upper_dev_unlink+0xb4/0x1fc
[] rmnet_dev_walk_unreg+0x6c/0xc8
[] netdev_walk_all_lower_dev_rcu+0x58/0xb4
[] rmnet_config_notify_cb+0xf4/0x134
[] raw_notifier_call_chain+0x58/0x78
[] call_netdevice_notifiers_info+0x48/0x78
[] rollback_registered_many+0x230/0x3c8
[] unregister_netdevice_many+0x38/0x94
[] rtnl_delete_link+0x58/0x88
[] rtnl_dellink+0xbc/0x1cc
[] rtnetlink_rcv_msg+0xb0/0x244
[] netlink_rcv_skb+0xb4/0xdc
[] rtnetlink_rcv+0x34/0x44
[] netlink_unicast+0x1ec/0x294
[] netlink_sendmsg+0x320/0x390
[] sock_sendmsg+0x54/0x60
[] ___sys_sendmsg+0x298/0x2b0
[] SyS_sendmsg+0xb4/0xf0
[] el0_svc_naked+0x24/0x28

Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial 
implementation")
Fixes: 60d58f971c10 ("net: qualcomm: rmnet: Implement bridge mode")
Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 68 +-
 1 file changed, 14 insertions(+), 54 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 7e7704d..c494918 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -43,12 +43,6 @@
 
 /* Local Definitions and Declarations */
 
-struct rmnet_walk_data {
-   struct net_device *real_dev;
-   struct list_head *head;
-   struct rmnet_port *port;
-};
-
 static int rmnet_is_real_dev_registered(const struct net_device *real_dev)
 {
return rcu_access_pointer(real_dev->rx_handler) == rmnet_rx_handler;
@@ -112,17 +106,14 @@ static int rmnet_register_real_device(struct net_device 
*real_dev)
 static void rmnet_unregister_bridge(struct net_device *dev,
struct rmnet_port *port)
 {
-   struct net_device *rmnet_dev, *bridge_dev;
struct rmnet_port *bridge_port;
+   struct net_device *bridge_dev;
 
if (port->rmnet_mode != RMNET_EPMODE_BRIDGE)
return;
 
/* bridge slave handling */
if (!port->nr_rmnet_devs) {
-   rmnet_dev = netdev_master_upper_dev_get_rcu(dev);
-   netdev_upper_dev_unlink(dev, rmnet_dev);
-
bridge_dev = port->bridge_ep;
 
bridge_port = rmnet_get_port_rtnl(bridge_dev);
@@ -132,9 +123,6 @@ static void rmnet_unregister_bridge(struct net_device *dev,
bridge_dev = port->bridge_ep;
 
bridge_port = rmnet_get_port_rtnl(bridge_dev);
-   rmnet_dev = netdev_master_upper_dev_get_rcu(bridge_dev);
-   netdev_upper_dev_unlink(bridge_dev, rmnet_dev);
-
rmnet_unregister_real_device(bridge_dev, bridge_port);
}
 }
@@ -173,10 +161,6 @@ static int rmnet_newlink(struct net *src_net, struct 
net_device *dev,
if (err)
goto err1;
 
-   err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL, extack);
-   if (err)
-   goto err2;
-
port->rmnet_mode = mode;
 
hlist_add_head_rcu(>hlnode, >muxed_ep[mux_id]);
@@ -193,8 +177,6 @@ static int rmnet_newlink(struct net *src_net, struct 
net_device *dev,
 
return 0;
 
-err2:
-   rmnet_vnd_dellink(mux_id, port, ep);
 err1:
rmnet_unregister_real_device(real_dev, port);
 err0:
@@ -204,14 +186,13 @@ static int rmnet_newlink(struct net *src_net, struct 
net_device *dev,
 
 static void rmnet_dellink(struct net_device *dev, struct list_head *head)
 {
+   struct rmnet_priv *priv = netdev_priv(dev);
struct net_device *real_dev;
struct rmnet_endpoint *ep;
struct rmnet_port *port;
u8 mux_id;
 
-   rcu_read_lock();
-   real_dev = netdev_master_upper_dev_get_rcu(dev);
-   rcu_read_unlock();
+   real_dev = priv->real_dev;
 
if (!real_dev || !rmnet_is_real_dev_registered(real_dev))
return;
@@ -219,7 +200,6 @@ static void rmnet_dellink(struct net_device *dev, struct 
list_head *head)
port = rmnet_get_port_rtnl(real_dev);
 
mux_id = rmnet_vnd_get_mux(dev);
-   

[PATCH net 2/3] net: qualcomm: rmnet: Fix warning seen with 64 bit stats

2018-02-16 Thread Subash Abhinov Kasiviswanathan
With CONFIG_DEBUG_PREEMPT enabled, a warning was seen on device
creation. This occurs due to the incorrect cpu API usage in
ndo_get_stats64 handler.

BUG: using smp_processor_id() in preemptible [] code: rmnetcli/5743
caller is debug_smp_processor_id+0x1c/0x24
Call trace:
[] dump_backtrace+0x0/0x2a8
[] show_stack+0x20/0x28
[] dump_stack+0xa8/0xe0
[] check_preemption_disabled+0x104/0x108
[] debug_smp_processor_id+0x1c/0x24
[] rmnet_get_stats64+0x64/0x13c
[] dev_get_stats+0x68/0xd8
[] rtnl_fill_stats+0x54/0x140
[] rtnl_fill_ifinfo+0x428/0x9cc
[] rtmsg_ifinfo_build_skb+0x80/0xf4
[] rtnetlink_event+0x88/0xb4
[] raw_notifier_call_chain+0x58/0x78
[] call_netdevice_notifiers_info+0x48/0x78
[] __netdev_upper_dev_link+0x290/0x5e8
[] netdev_master_upper_dev_link+0x3c/0x48
[] rmnet_newlink+0xf0/0x1c8
[] rtnl_newlink+0x57c/0x6c8
[] rtnetlink_rcv_msg+0xb0/0x244
[] netlink_rcv_skb+0xb4/0xdc
[] rtnetlink_rcv+0x34/0x44
[] netlink_unicast+0x1ec/0x294
[] netlink_sendmsg+0x320/0x390
[] sock_sendmsg+0x54/0x60
[] SyS_sendto+0x1a0/0x1e4
[] el0_svc_naked+0x24/0x28

Fixes: 192c4b5d48f2 ("net: qualcomm: rmnet: Add support for 64 bit stats")
Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c 
b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
index 570a227..346d310 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
@@ -121,7 +121,7 @@ static void rmnet_get_stats64(struct net_device *dev,
memset(_stats, 0, sizeof(struct rmnet_vnd_stats));
 
for_each_possible_cpu(cpu) {
-   pcpu_ptr = this_cpu_ptr(priv->pcpu_stats);
+   pcpu_ptr = per_cpu_ptr(priv->pcpu_stats, cpu);
 
do {
start = u64_stats_fetch_begin_irq(_ptr->syncp);
-- 
1.9.1



[PATCH net 0/3] net: qualcomm: rmnet: Fix issues with CONFIG_DEBUG_PREEMPT enabled

2018-02-16 Thread Subash Abhinov Kasiviswanathan
Patch 1 and 2 fixes issues identified when CONFIG_DEBUG_PREEMPT was
enabled. These involve APIs which were called in invalid contexts.

Patch 3 is a null derefence fix identified by code inspection.


Subash Abhinov Kasiviswanathan (3):
  net: qualcomm: rmnet: Fix crash on real dev unregistration
  net: qualcomm: rmnet: Fix warning seen with 64 bit stats
  net: qualcomm: rmnet: Fix possible null dereference in command
processing

 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 68 +-
 .../ethernet/qualcomm/rmnet/rmnet_map_command.c|  5 ++
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c|  2 +-
 3 files changed, 20 insertions(+), 55 deletions(-)

-- 
1.9.1



Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Oleksandr Natalenko
Hi.

On pátek 16. února 2018 21:54:05 CET Eric Dumazet wrote:
> /* snip */
> Something fishy really :
> /* snip */
> Not only the receiver suddenly adds a 25 ms delay, but also note that
> it acknowledges all prior segments (ack 112949), but with a wrong ecr
> value ( 2327043753 )
> instead of 2327043759
> /* snip */

Eric has encouraged me to look closer at what's there in the ethtool, and I've 
just had a free time to play with it. I've found out that enabling scatter-
gather (ethtool -K enp3s0 sg on, it is disabled by default on both hosts) 
brings the throughput back to normal even with BBR+fq_codel.

Why? What's the deal BBR has with sg?

Oleksandr




Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Eric Dumazet
On Fri, 2018-02-16 at 12:54 -0800, Eric Dumazet wrote:
> On Fri, Feb 16, 2018 at 9:25 AM, Oleksandr Natalenko
>  wrote:
> > Hi.
> > 
> > On pátek 16. února 2018 17:33:48 CET Neal Cardwell wrote:
> > > Thanks for the detailed report! Yes, this sounds like an issue in BBR. We
> > > have not run into this one in our team, but we will try to work with you 
> > > to
> > > fix this.
> > > 
> > > Would you be able to take a sender-side tcpdump trace of the slow BBR
> > > transfer ("v4.13 + BBR + fq_codel == Not OK")? Packet headers only would 
> > > be
> > > fine. Maybe something like:
> > > 
> > >   tcpdump -w /tmp/test.pcap -c100 -s 100 -i eth0 port $PORT
> > 
> > So, going on with two real HW hosts. They are both running latest stock Arch
> > Linux kernel (4.15.3-1-ARCH, CONFIG_PREEMPT=y, CONFIG_HZ=1000) and are
> > interconnected with 1 Gbps link (via switch if that matters). Using iperf3,
> > running each test for 20 seconds.
> > 
> > Having BBR+fq_codel (or pfifo_fast, same result) on both hosts:
> > 
> > Client to server: 112 Mbits/sec
> > Server to client: 96.1 Mbits/sec
> > 
> > Having BBR+fq on both hosts:
> > 
> > Client to server: 347 Mbits/sec
> > Server to client: 397 Mbits/sec
> > 
> > Having YeAH+fq on both hosts:
> > [1] https://natalenko.name/myfiles/bbr/
> > 
> 
> Something fishy really :
> 
> 09:18:31.449903 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.],
> seq 76745:79641, ack 38, win 227, options [nop,nop,TS val 2327043753
> ecr 3190508870], length 2896
> 09:18:31.449916 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
> ack 79641, win 1011, options [nop,nop,TS val 3190508870 ecr
> 2327043753], length 0
> 09:18:31.449925 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
> seq 79641:83985, ack 38, win 227, options [nop,nop,TS val 2327043753
> ecr 3190508870], length 4344
> 09:18:31.449936 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
> ack 83985, win 987, options [nop,nop,TS val 3190508870 ecr
> 2327043753], length 0
> 09:18:31.450112 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
> seq 83985:86881, ack 38, win 227, options [nop,nop,TS val 2327043753
> ecr 3190508870], length 2896
> 09:18:31.450124 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
> ack 86881, win 971, options [nop,nop,TS val 3190508871 ecr
> 2327043753], length 0
> 09:18:31.450299 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
> seq 86881:91225, ack 38, win 227, options [nop,nop,TS val 2327043753
> ecr 3190508870], length 4344
> 09:18:31.450313 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
> ack 91225, win 947, options [nop,nop,TS val 3190508871 ecr
> 2327043753], length 0
> 09:18:31.450491 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.],
> seq 91225:92673, ack 38, win 227, options [nop,nop,TS val 2327043753
> ecr 3190508870], length 1448
> 09:18:31.450505 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
> seq 92673:94121, ack 38, win 227, options [nop,nop,TS val 2327043753
> ecr 3190508871], length 1448
> 09:18:31.450511 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.],
> seq 94121:95569, ack 38, win 227, options [nop,nop,TS val 2327043754
> ecr 3190508871], length 1448
> 09:18:31.450720 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
> seq 95569:101361, ack 38, win 227, options [nop,nop,TS val 2327043754
> ecr 3190508871], length 5792
> 09:18:31.450932 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
> seq 101361:105705, ack 38, win 227, options [nop,nop,TS val 2327043754
> ecr 3190508871], length 4344
> 09:18:31.451132 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
> seq 105705:110049, ack 38, win 227, options [nop,nop,TS val 2327043754
> ecr 3190508871], length 4344
> 09:18:31.451342 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
> seq 110049:111497, ack 38, win 227, options [nop,nop,TS val 2327043754
> ecr 3190508871], length 1448
> 09:18:31.455841 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
> seq 111497:112945, ack 38, win 227, options [nop,nop,TS val 2327043759
> ecr 3190508871], length 1448
> 
> Not only the receiver suddenly adds a 25 ms delay, but also note that
> it acknowledges all prior segments (ack 112949), but with a wrong ecr
> value ( 2327043753 )
> instead of 2327043759

If you use 

tcptrace -R test_s2c.pcap
xplot.org d2c_rtt.xpl

Then you'll see plenty of suspect 40ms rtt samples.

It looks like receiver misses wakeups for some reason,
and only the TCP delayed ACK timer is helping.

So it does not look like a sender side issue to me.



Re: [PATCH] tun: fix mismatch in mutex lock-unlock in tun_get_user()

2018-02-16 Thread Eric Dumazet
On Fri, Feb 16, 2018 at 2:11 PM, Alexey Khoroshilov
 wrote:
> There is a single error path where tfile->napi_mutex is left unlocked.
> It can lead to a deadlock.
>
> Found by Linux Driver Verification project (linuxtesting.org).
>
> Signed-off-by: Alexey Khoroshilov 
> ---
>  drivers/net/tun.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 81e6cc951e7f..0072a9832532 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1879,6 +1879,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, 
> struct tun_file *tfile,
> default:
> this_cpu_inc(tun->pcpu_stats->rx_dropped);
> kfree_skb(skb);
> +   if (frags) {
> +   tfile->napi.skb = NULL;
> +   mutex_unlock(>napi_mutex);
> +   }
> return -EINVAL;


I do not believe this can happen for IFF_TUN

IFF_NAPI_FRAGS can only be set for IFF_TAP


Re: [PATCH RFC 0/4] net: add bpfilter

2018-02-16 Thread David Miller
From: Florian Westphal 
Date: Fri, 16 Feb 2018 17:14:08 +0100

> Any particular reason why translating iptables rather than nftables
> (it should be possible to monitor the nftables changes that are
>  announced by kernel and act on those)?

As Daniel said, iptables is by far the most deployed of the two
technologies.  Therefore it provides the largest environment for
testing and coverage.


Re: [PATCH RFC 0/4] net: add bpfilter

2018-02-16 Thread David Miller
From: Florian Westphal 
Date: Fri, 16 Feb 2018 15:57:27 +0100

> 4. Do you plan to reimplement connection tracking in userspace?
> If no, how will the bpf program interact with it?

The natural way to handle this, as with anything BPF related, is with
appropriate BPF helpers which would be added for this purpose.


[PATCH] tun: fix mismatch in mutex lock-unlock in tun_get_user()

2018-02-16 Thread Alexey Khoroshilov
There is a single error path where tfile->napi_mutex is left unlocked.
It can lead to a deadlock.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Alexey Khoroshilov 
---
 drivers/net/tun.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 81e6cc951e7f..0072a9832532 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1879,6 +1879,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, 
struct tun_file *tfile,
default:
this_cpu_inc(tun->pcpu_stats->rx_dropped);
kfree_skb(skb);
+   if (frags) {
+   tfile->napi.skb = NULL;
+   mutex_unlock(>napi_mutex);
+   }
return -EINVAL;
}
}
-- 
2.7.4



Re: [PATCH net-next] net: Only honor ifindex in IP_PKTINFO if non-0

2018-02-16 Thread David Ahern
On 2/16/18 2:43 PM, David Miller wrote:
> From: David Ahern 
> Date: Fri, 16 Feb 2018 11:03:03 -0800
> 
>> Only allow ifindex from IP_PKTINFO to override SO_BINDTODEVICE settings
>> if the index is actually set in the message.
>>
>> Signed-off-by: David Ahern 
> 
> Ok, this behavior meets reasonable expectations, applied, thanks.
> 
> None of the documentation is clear about this relationship between
> ip_pktinfo's ifindex and settings made by SO_BINDTODEVICE.
> 

It is my understanding that SO_BINDTODEVICE is the strongest -- it
requires admin to set. From there IP_PKTINFO and IP_UNICAST_IF are
non-root options and hence weaker. If that is the proper expectation,
then the right thing to do is probably to error out if ipc.oif is
already set. I was concerned that would break existing apps, so this
seemed to be a compromise.


Re: [PATCH net-next] tun: export flags, uid, gid, queue information over netlink

2018-02-16 Thread David Miller
From: Sabrina Dubroca 
Date: Fri, 16 Feb 2018 11:03:07 +0100

> Signed-off-by: Sabrina Dubroca 
> Reviewed-by: Stefano Brivio 

Looks good, applied, thanks Sabrina.


Re: [PATCH V6 2/4] sctp: Add ip option support

2018-02-16 Thread Marcelo Ricardo Leitner
On Fri, Feb 16, 2018 at 03:14:35PM -0500, Neil Horman wrote:
> On Fri, Feb 16, 2018 at 10:56:07AM -0200, Marcelo Ricardo Leitner wrote:
> > On Thu, Feb 15, 2018 at 09:15:40AM -0500, Neil Horman wrote:
> > > On Tue, Feb 13, 2018 at 08:54:44PM +, Richard Haines wrote:
> > > > Add ip option support to allow LSM security modules to utilise 
> > > > CIPSO/IPv4
> > > > and CALIPSO/IPv6 services.
> > > > 
> > > > Signed-off-by: Richard Haines 
> > > > ---
> > > >  include/net/sctp/sctp.h|  4 +++-
> > > >  include/net/sctp/structs.h |  2 ++
> > > >  net/sctp/chunk.c   | 12 +++-
> > > >  net/sctp/ipv6.c| 42 
> > > > +++---
> > > >  net/sctp/output.c  |  5 -
> > > >  net/sctp/protocol.c| 36 
> > > >  net/sctp/socket.c  | 14 ++
> > > >  7 files changed, 97 insertions(+), 18 deletions(-)
> > > > 
> > > > diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> > > > index f7ae6b0..25c5c87 100644
> > > > --- a/include/net/sctp/sctp.h
> > > > +++ b/include/net/sctp/sctp.h
> > > > @@ -441,9 +441,11 @@ static inline int sctp_list_single_entry(struct 
> > > > list_head *head)
> > > >  static inline int sctp_frag_point(const struct sctp_association *asoc, 
> > > > int pmtu)
> > > >  {
> > > > struct sctp_sock *sp = sctp_sk(asoc->base.sk);
> > > > +   struct sctp_af *af = sp->pf->af;
> > > > int frag = pmtu;
> > > >  
> > > > -   frag -= sp->pf->af->net_header_len;
> > > > +   frag -= af->ip_options_len(asoc->base.sk);
> > > > +   frag -= af->net_header_len;
> > > > frag -= sizeof(struct sctphdr) + 
> > > > sctp_datachk_len(>stream);
> > > >  
> > > > if (asoc->user_frag)
> > > > diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> > > > index 03e92dd..ead5fce 100644
> > > > --- a/include/net/sctp/structs.h
> > > > +++ b/include/net/sctp/structs.h
> > > > @@ -491,6 +491,7 @@ struct sctp_af {
> > > > void(*ecn_capable)(struct sock *sk);
> > > > __u16   net_header_len;
> > > > int sockaddr_len;
> > > > +   int (*ip_options_len)(struct sock *sk);
> > > > sa_family_t sa_family;
> > > > struct list_head list;
> > > >  };
> > > > @@ -515,6 +516,7 @@ struct sctp_pf {
> > > > int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr 
> > > > *addr);
> > > > void (*to_sk_saddr)(union sctp_addr *, struct sock *sk);
> > > > void (*to_sk_daddr)(union sctp_addr *, struct sock *sk);
> > > > +   void (*copy_ip_options)(struct sock *sk, struct sock *newsk);
> > > > struct sctp_af *af;
> > > >  };
> > > >  
> > > > diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> > > > index 991a530..d5c0ef7 100644
> > > > --- a/net/sctp/chunk.c
> > > > +++ b/net/sctp/chunk.c
> > > > @@ -154,7 +154,6 @@ static void sctp_datamsg_assign(struct sctp_datamsg 
> > > > *msg, struct sctp_chunk *chu
> > > > chunk->msg = msg;
> > > >  }
> > > >  
> > > > -
> > > >  /* A data chunk can have a maximum payload of (2^16 - 20).  Break
> > > >   * down any such message into smaller chunks.  Opportunistically, 
> > > > fragment
> > > >   * the chunks down to the current MTU constraints.  We may get 
> > > > refragmented
> > > > @@ -171,6 +170,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
> > > > sctp_association *asoc,
> > > > struct list_head *pos, *temp;
> > > > struct sctp_chunk *chunk;
> > > > struct sctp_datamsg *msg;
> > > > +   struct sctp_sock *sp;
> > > > +   struct sctp_af *af;
> > > > int err;
> > > >  
> > > > msg = sctp_datamsg_new(GFP_KERNEL);
> > > > @@ -189,9 +190,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
> > > > sctp_association *asoc,
> > > > /* This is the biggest possible DATA chunk that can fit into
> > > >  * the packet
> > > >  */
> > > > -   max_data = asoc->pathmtu -
> > > > -  sctp_sk(asoc->base.sk)->pf->af->net_header_len -
> > > > -  sizeof(struct sctphdr) - 
> > > > sctp_datachk_len(>stream);
> > > > +   sp = sctp_sk(asoc->base.sk);
> > > > +   af = sp->pf->af;
> > > > +   max_data = asoc->pathmtu - af->net_header_len -
> > > > +  sizeof(struct sctphdr) - 
> > > > sctp_datachk_len(>stream) -
> > > > +  af->ip_options_len(asoc->base.sk);
> > > > max_data = SCTP_TRUNC4(max_data);
> > > >  
> > > > /* If the the peer requested that we authenticate DATA chunks
> > > > @@ -211,7 +214,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
> > > > sctp_association *asoc,
> > > >  
> > > > /* Set first_len and then account for possible bundles on first 
> > > > frag */
> > > > first_len = max_data;
> > > > -
> > > > /* Check to see if we have a 

Re: [PATCH net-next] net: Only honor ifindex in IP_PKTINFO if non-0

2018-02-16 Thread David Miller
From: David Ahern 
Date: Fri, 16 Feb 2018 11:03:03 -0800

> Only allow ifindex from IP_PKTINFO to override SO_BINDTODEVICE settings
> if the index is actually set in the message.
> 
> Signed-off-by: David Ahern 

Ok, this behavior meets reasonable expectations, applied, thanks.

None of the documentation is clear about this relationship between
ip_pktinfo's ifindex and settings made by SO_BINDTODEVICE.


Re: [PATCH][V2] net: dsa: mv88e6xxx: avoid unintended sign extension on a 16 bit shift

2018-02-16 Thread David Miller
From: Colin King 
Date: Fri, 16 Feb 2018 16:55:05 +

> From: Colin Ian King 
> 
> The shifting of timehi by 16 bits to the left will be promoted to
> a 32 bit signed int and then sign-extended to an u64. If the top bit
> of timehi is set then all then all the upper bits of ns end up as also
> being set because of the sign-extension. Fix this by making timehi and
> timelo u64.  Also move the declaration of ns.
> 
> Detected by CoverityScan, CID#1465288 ("Unintended sign extension")
> 
> Fixes: c6fe0ad2c349 ("net: dsa: mv88e6xxx: add rx/tx timestamping support")
> Signed-off-by: Colin Ian King 

Please indicate the appropriate target tree in your Subject lines
in the future, for this it should be "[PATCH net-next]".

Applied, thanks.


Re: [PATCH v2] ravb: add support for changing MTU

2018-02-16 Thread David Miller
From: Niklas Söderlund 
Date: Fri, 16 Feb 2018 17:10:08 +0100

> Allow for changing the MTU within the limit of the maximum size of a
> descriptor (2048 bytes). Add the callback to change MTU from user-space
> and take the configurable MTU into account when configuring the
> hardware.
> 
> Signed-off-by: Niklas Söderlund 

Applied.

However, most drivers support MTU change while the interface is up and
frankly that is what is most useful for users.


Re: [RFC PATCH] ptr_ring: linked list fallback

2018-02-16 Thread David Miller
From: "Michael S. Tsirkin" 
Date: Fri, 16 Feb 2018 09:40:54 +0200

> So pointer rings work fine, but they have a problem:
> make them too small and not enough entries fit.
> Make them too large and you start flushing your cache
> and running out of memory.
> 
> This is a new idea of mine: a ring backed by a
> linked list. Once you run out of ring entries,
> instead of a drop you fall back on a list with
> a common lock.
> 
> Should work well for the case where the ring is typically sized
> correctly, but will help address the fact that some user try to set e.g.
> tx queue length to 100.
> 
> My hope is this will move us closer to a direction where e.g. fq_codel can
> use ptr rings without locking at all.
> The API is still very rough, and I really need to take a hard look
> at lock nesting.
> 
> Completely untested, sending for early feedback/flames.
> 
> Signed-off-by: Michael S. Tsirkin 

So the idea is that if a user sets a really huge TX queue length, we allocate
a ptr_ring which is smaller, and use the backup linked list when necessary
to provide the requested TX queue length legitimately.

Right?


Re: [PATCH net] sctp: remove the left unnecessary check for chunk in sctp_renege_events

2018-02-16 Thread David Miller
From: Xin Long 
Date: Fri, 16 Feb 2018 17:18:33 +0800

> Commit fb23403536ea ("sctp: remove the useless check in
> sctp_renege_events") forgot to remove another check for
> chunk in sctp_renege_events.
> 
> Dan found this when doing a static check.
> 
> This patch is to remove that check, and also to merge
> two checks into one 'if statement'.
> 
> Fixes: fb23403536ea ("sctp: remove the useless check in sctp_renege_events")
> Reported-by: Dan Carpenter 
> Signed-off-by: Xin Long 

Applied, thank you.


Re: [PATCH net] tg3: APE heartbeat changes

2018-02-16 Thread David Miller
From: Satish Baddipadige 
Date: Fri, 16 Feb 2018 10:01:29 +0530

> @@ -990,6 +984,18 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, 
> int kind)
>   tg3_ape_send_event(tp, event);
>  }
>  
> +static inline void tg3_send_ape_heartbeat(struct tg3 *tp,

Inline functions are not appropriate in foo.c files, please drop the
inline keyword and let the compiler decide.


Re: [PATCH net-next 0/2] nfp: whitespace sync and flower TCP flags

2018-02-16 Thread David Miller
From: Jakub Kicinski 
Date: Thu, 15 Feb 2018 20:19:07 -0800

> Whitespace cleanup from Michael and flower offload support for matching
> on TCP flags from Pieter.

Series applied, thanks Jakub.


Re: [PATCH net] rxrpc: Work around usercopy check

2018-02-16 Thread David Miller
From: David Howells 
Date: Thu, 15 Feb 2018 22:59:00 +

> Due to a check recently added to copy_to_user(), it's now not permitted to
> copy from slab-held data to userspace unless the slab is whitelisted.  This
> affects rxrpc_recvmsg() when it attempts to place an RXRPC_USER_CALL_ID
> control message in the userspace control message buffer.  A warning is
> generated by usercopy_warn() because the source is the copy of the
> user_call_ID retained in the rxrpc_call struct.
> 
> Work around the issue by copying the user_call_ID to a variable on the
> stack and passing that to put_cmsg().
> 
> The warning generated looks like:
 ...
> Reported-by: Jonathan Billings 
> Signed-off-by: David Howells 
> Acked-by: Kees Cook 
> Tested-by: Jonathan Billings 

Applied, thanks David.


Re: [PATCH net] tun: fix tun_napi_alloc_frags() frag allocator

2018-02-16 Thread David Miller
From: Eric Dumazet 
Date: Thu, 15 Feb 2018 14:47:15 -0800

> From: Eric Dumazet 
> 
> 
> While fuzzing arm64 v4.16-rc1 with Syzkaller, I've been hitting a
> misaligned atomic in __skb_clone:
> 
> atomic_inc(&(skb_shinfo(skb)->dataref));
> 
>where dataref doesn't have the required natural alignment, and the
>atomic operation faults. e.g. i often see it aligned to a single
>byte boundary rather than a four byte boundary.
> 
>AFAICT, the skb_shared_info is misaligned at the instant it's
>allocated in __napi_alloc_skb()
> 
> 
> Problem is caused by tun_napi_alloc_frags() using
> napi_alloc_frag() with user provided seg sizes,
> leading to other users of this API getting unaligned
> page fragments.
> 
> Since we would like to not necessarily add paddings or alignments to
> the frags that tun_napi_alloc_frags() attaches to the skb, switch to
> another page frag allocator.
> 
> As a bonus skb_page_frag_refill() can use GFP_KERNEL allocations,
> meaning that we can not deplete memory reserves as easily.
> 
> Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver")
> Signed-off-by: Eric Dumazet 
> Reported-by: Mark Rutland 
> Tested-by: Mark Rutland 

Applied and queued up for -stable, thanks Eric.


Re: [PATCH v3 net-next 0/7] RDS: zerocopy support

2018-02-16 Thread David Miller
From: Sowmini Varadhan 
Date: Thu, 15 Feb 2018 10:49:31 -0800

> This is version 3 of the series, following up on review comments for
>  http://patchwork.ozlabs.org/project/netdev/list/?series=28530
> 
> Review comments addressed
> Patch 4
>   - fix fragile use of skb->cb[], do not set ee_code incorrectly.
> Patch 5:
>   - remove needless bzero of skb->cb[], consolidate err cleanup
> 
> A brief overview of this feature follows.
> 
> This patch series provides support for MSG_ZEROCOPY
> on a PF_RDS socket based on the APIs and infrastructure added
> by Commit f214f915e7db ("tcp: enable MSG_ZEROCOPY")
> 
> For single threaded rds-stress testing using rds-tcp with the
> ixgbe driver using 1M message sizes (-a 1M -q 1M) preliminary
> results show that  there is a significant reduction in latency: about
> 90 usec with zerocopy, compared with 200 usec without zerocopy.
> 
> This patchset modifies the above for zerocopy in the following manner.
> - if the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and,
> - if the SO_ZEROCOPY  socket option has been set on the PF_RDS socket,
> application pages sent down with rds_sendmsg are pinned. The pinning
> uses the accounting infrastructure added by a91dbff551a6 ("sock: ulimit 
> on MSG_ZEROCOPY pages"). The message is unpinned when all references
> to the message go down to 0, and the message is freed by rds_message_purge.
> 
> A multithreaded application using this infrastructure must send down 
> a unique 32 bit cookie as ancillary data with each sendmsg invocation.
> The format of this ancillary data is described in Patch 5 of the series.
> The cookie is passed up to the application on the sk_error_queue when 
> the message is unpinned, indicating to the application that it is now
> safe to free/reuse the message buffer. The details of the completion
> notification are provided in Patch 4 of this series.

Series applied, thanks a lot!


Re: [PATCH v1 1/1] drivers: isdn: NULL pointer dereference [null-pointer-deref] (CWE 476) problem

2018-02-16 Thread David Miller
From: Joe Moriarty 
Date: Thu, 15 Feb 2018 15:27:00 -0500

> The Parfait (version 2.1.0) static code analysis tool found the
> following NULL pointer dereference problem.
> 
> - drivers/isdn/mISDN/core.c
> function channelmap_show() does not check the returned mdev
> variable from dev_to_mISDN() for NULL.  Added the check for
> NULL, which results in a value of 0 being returned.
> 
> Signed-off-by: Joe Moriarty 
> Reviewed-by: Jonathan Helman 

Since the lifetime for the sysfs files for these devices is strictly
smaller than the lifetime of the 'dev' object and it's driver data,
it is not possible for mdev to be NULL in this situation.

I understand what the static checker is trying to do, but within
context this lack of a NULL check is legitimate.

I'm not applying this, sorry.


Re: [PATCH net] udplite: fix partial checksum initialization

2018-02-16 Thread David Miller
From: Alexey Kodanev 
Date: Thu, 15 Feb 2018 20:18:43 +0300

> Since UDP-Lite is always using checksum, the following path is
> triggered when calculating pseudo header for it:
> 
>   udp4_csum_init() or udp6_csum_init()
> skb_checksum_init_zero_check()
>   __skb_checksum_validate_complete()
> 
> The problem can appear if skb->len is less than CHECKSUM_BREAK. In
> this particular case __skb_checksum_validate_complete() also invokes
> __skb_checksum_complete(skb). If UDP-Lite is using partial checksum
> that covers only part of a packet, the function will return bad
> checksum and the packet will be dropped.
> 
> It can be fixed if we skip skb_checksum_init_zero_check() and only
> set the required pseudo header checksum for UDP-Lite with partial
> checksum before udp4_csum_init()/udp6_csum_init() functions return.
> 
> Fixes: ed70fcfcee95 ("net: Call skb_checksum_init in IPv4")
> Fixes: e4f45b7f40bd ("net: Call skb_checksum_init in IPv6")
> Signed-off-by: Alexey Kodanev 

Applied and queued up for -stable, thanks Alexey.


Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Eric Dumazet
On Fri, Feb 16, 2018 at 9:25 AM, Oleksandr Natalenko
 wrote:
> Hi.
>
> On pátek 16. února 2018 17:33:48 CET Neal Cardwell wrote:
>> Thanks for the detailed report! Yes, this sounds like an issue in BBR. We
>> have not run into this one in our team, but we will try to work with you to
>> fix this.
>>
>> Would you be able to take a sender-side tcpdump trace of the slow BBR
>> transfer ("v4.13 + BBR + fq_codel == Not OK")? Packet headers only would be
>> fine. Maybe something like:
>>
>>   tcpdump -w /tmp/test.pcap -c100 -s 100 -i eth0 port $PORT
>
> So, going on with two real HW hosts. They are both running latest stock Arch
> Linux kernel (4.15.3-1-ARCH, CONFIG_PREEMPT=y, CONFIG_HZ=1000) and are
> interconnected with 1 Gbps link (via switch if that matters). Using iperf3,
> running each test for 20 seconds.
>
> Having BBR+fq_codel (or pfifo_fast, same result) on both hosts:
>
> Client to server: 112 Mbits/sec
> Server to client: 96.1 Mbits/sec
>
> Having BBR+fq on both hosts:
>
> Client to server: 347 Mbits/sec
> Server to client: 397 Mbits/sec
>
> Having YeAH+fq on both hosts:

> [1] https://natalenko.name/myfiles/bbr/
>

Something fishy really :

09:18:31.449903 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.],
seq 76745:79641, ack 38, win 227, options [nop,nop,TS val 2327043753
ecr 3190508870], length 2896
09:18:31.449916 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
ack 79641, win 1011, options [nop,nop,TS val 3190508870 ecr
2327043753], length 0
09:18:31.449925 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 79641:83985, ack 38, win 227, options [nop,nop,TS val 2327043753
ecr 3190508870], length 4344
09:18:31.449936 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
ack 83985, win 987, options [nop,nop,TS val 3190508870 ecr
2327043753], length 0
09:18:31.450112 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 83985:86881, ack 38, win 227, options [nop,nop,TS val 2327043753
ecr 3190508870], length 2896
09:18:31.450124 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
ack 86881, win 971, options [nop,nop,TS val 3190508871 ecr
2327043753], length 0
09:18:31.450299 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 86881:91225, ack 38, win 227, options [nop,nop,TS val 2327043753
ecr 3190508870], length 4344
09:18:31.450313 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
ack 91225, win 947, options [nop,nop,TS val 3190508871 ecr
2327043753], length 0
09:18:31.450491 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.],
seq 91225:92673, ack 38, win 227, options [nop,nop,TS val 2327043753
ecr 3190508870], length 1448
09:18:31.450505 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 92673:94121, ack 38, win 227, options [nop,nop,TS val 2327043753
ecr 3190508871], length 1448
09:18:31.450511 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.],
seq 94121:95569, ack 38, win 227, options [nop,nop,TS val 2327043754
ecr 3190508871], length 1448
09:18:31.450720 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 95569:101361, ack 38, win 227, options [nop,nop,TS val 2327043754
ecr 3190508871], length 5792
09:18:31.450932 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 101361:105705, ack 38, win 227, options [nop,nop,TS val 2327043754
ecr 3190508871], length 4344
09:18:31.451132 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 105705:110049, ack 38, win 227, options [nop,nop,TS val 2327043754
ecr 3190508871], length 4344
09:18:31.451342 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 110049:111497, ack 38, win 227, options [nop,nop,TS val 2327043754
ecr 3190508871], length 1448
09:18:31.455841 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 111497:112945, ack 38, win 227, options [nop,nop,TS val 2327043759
ecr 3190508871], length 1448

Not only the receiver suddenly adds a 25 ms delay, but also note that
it acknowledges all prior segments (ack 112945), but with a wrong ecr
value ( 2327043753 )
instead of 2327043759

09:18:31.482238 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
ack 112945, win , options [nop,nop,TS val 3190508903 ecr
2327043753], length 0
09:18:31.482704 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 112945:114393, ack 38, win 227, options [nop,nop,TS val 2327043786
ecr 3190508903], length 1448
09:18:31.482734 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
ack 114393, win 1134, options [nop,nop,TS val 3190508903 ecr
2327043786], length 0
09:18:31.482802 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 114393:117289, ack 38, win 227, options [nop,nop,TS val 2327043786
ecr 3190508903], length 2896
09:18:31.482813 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.],
ack 117289, win 1179, options [nop,nop,TS val 3190508903 ecr
2327043786], length 0
09:18:31.483138 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.],
seq 117289:120185, ack 38, win 227, options [nop,nop,TS val 2327043786
ecr 3190508903], length 2896
09:18:31.483158 IP 172.29.28.55.14936 

[PATCH] skbuff: Fix comment mis-spelling.

2018-02-16 Thread David Miller

'peform' --> 'perform'

Signed-off-by: David S. Miller 
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5ebc0f869720..c1e66bdcf583 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3646,7 +3646,7 @@ static inline bool __skb_checksum_validate_needed(struct 
sk_buff *skb,
return true;
 }
 
-/* For small packets <= CHECKSUM_BREAK peform checksum complete directly
+/* For small packets <= CHECKSUM_BREAK perform checksum complete directly
  * in checksum_init.
  */
 #define CHECKSUM_BREAK 76
-- 
2.14.3



Re: [PATCH net] decnet: move nf_{get/set}sockopt outside sock lock

2018-02-16 Thread David Miller
From: Paolo Abeni 
Date: Thu, 15 Feb 2018 16:59:49 +0100

> After commit 3f34cfae1238 ("netfilter: on sockopt() acquire sock lock
> only in the required scope"), the caller of nf_{get/set}sockopt() must
> not hold any lock, but, in such changeset, I forgot to cope with DECnet.
> 
> This commit addresses the issue moving the nf call outside the lock,
> in the dn_{get,set}sockopt() with the same schema currently used by
> ipv4 and ipv6. Also moves the unhandled sockopts of the end of the main
> switch statements, to improve code readability.
> 
> Reported-by: Petr Vandrovec 
> BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=198791#c2
> Fixes: 3f34cfae1238 ("netfilter: on sockopt() acquire sock lock only in the 
> required scope")
> Signed-off-by: Paolo Abeni 

Applied, thank you.


Re: [PATCHv3 net-next 0/8] net: sched: act: add extack support

2018-02-16 Thread David Miller
From: Alexander Aring 
Date: Thu, 15 Feb 2018 10:54:52 -0500

> this patch series adds extack support for the TC action subsystem.
> As an example for the extack support in a TC action I chose the
> mirred action.
> 
> - Alex
> 
> Cc: David Ahern 
> 
> changes since v3:
> - adapt recommended changes from Davide Caratti, please check if
>   I catch everything. Thanks.
> 
> changes since v2:
> 
> - remove newline in extack of generic walker handling
>   Thanks to Davide Caratti
> - add ker...@mojatatu.com in cc

Series applied, thanks.


Re: [PATCH RFC 0/4] net: add bpfilter

2018-02-16 Thread Daniel Borkmann
Hi Florian,

On 02/16/2018 05:14 PM, Florian Westphal wrote:
> Florian Westphal  wrote:
>> Daniel Borkmann  wrote:
>> Several questions spinning at the moment, I will probably come up with
>> more:
> 
> ... and here there are some more ...
> 
> One of the many pain points of xtables design is the assumption of 'used
> only by sysadmin'.
> 
> This has not been true for a very long time, so by now iptables has
> this userspace lock (yes, its fugly workaround) to serialize concurrent
> iptables invocations in userspace.
> 
> AFAIU the translate-in-userspace design now brings back the old problem
> of different tools overwriting each others iptables rules.

Right, so the behavior would need to be adapted to be exactly the same,
given all the requests go into kernel space first via the usual uapis,
I don't think there would be anything in the way of keeping that as is.

> Another question -- am i correct in that each rule manipulation would
> incur a 'recompilation'?  Or are there different mini programs chained
> together?

Right now in the PoC yes, basically it regenerates the program on the fly
in gen.c when walking the struct bpfilter_ipt_ip's and appends the entries
to the program, but it doesn't have to be that way. There are multiple
options to allow for a partial code generation, e.g. via chaining tail
call arrays or directly via BPF to BPF calls eventually, there would be
few changes on BPF side needed, but it can be done; there could additionally
be various optimizations passes during code generation phase performed
while keeping given constraints in order to speed up getting to a verdict.

> One of the nftables advantages is that (since rule representation in
> kernel is black-box from userspace point of view) is that the kernel
> can announce add/delete of rules or elements from nftables sets.
> 
> Any particular reason why translating iptables rather than nftables
> (it should be possible to monitor the nftables changes that are
>  announced by kernel and act on those)?

Yeah, correct, this should be possible as well. We started out with the
iptables part in the demo as the majority of bigger infrastructure projects
all still rely heavily on it (e.g. docker, k8s to just name two big ones).
Usually they have their requests to iptables baked into their code directly
which probably won't change any time soon, so thought was that they could
benefit initially from it once there would be sufficient coverage.

Thanks,
Daniel


Re: [PATCH net] net: sched: fix unbalance in the error path of tca_action_flush()

2018-02-16 Thread David Miller
From: Davide Caratti 
Date: Thu, 15 Feb 2018 15:50:57 +0100

> When tca_action_flush() calls the action walk() and gets an error,
> a successful call to nla_nest_start() is not followed by a call to
> nla_nest_cancel(). It's harmless, as the skb is freed in the error
> path - but it's worth to fix this unbalance.
> 
> Signed-off-by: Davide Caratti 

Applied to net-next, thanks.


Re: [PATCH] PCI/cxgb4: Extend T3 PCI quirk to T4+ devices

2018-02-16 Thread David Miller
From: Ganesh Goudar 
Date: Thu, 15 Feb 2018 20:03:18 +0530

> From: Casey Leedom 
> 
> We've run into a problem where our device is attached
> to a Virtual Machine and the use of the new pci_set_vpd_size()
> API doesn't help.  The VM kernel has been informed that
> the accesses are okay, but all of the actual VPD Capability
> Accesses are trapped down into the KVM Hypervisor where it
> goes ahead and imposes the silent denials.
> 
> The right idea is to follow the kernel.org
> commit 1c7de2b4ff88 ("PCI: Enable access to non-standard VPD for
> Chelsio devices (cxgb3)") which Alexey Kardashevskiy authored
> to establish a PCI Quirk for our T3-based adapters. This commit
> extends that PCI Quirk to cover Chelsio T4 devices and later.
> 
> The advantage of this approach is that the VPD Size gets set early
> in the Base OS/Hypervisor Boot and doesn't require that the cxgb4
> driver even be available in the Base OS/Hypervisor.  Thus PF4 can
> be exported to a Virtual Machine and everything should work.
> 
> Fixes: 67e658794ca1 ("cxgb4: Set VPD size so we can read both VPD structures")
> Cc:   # v4.9+
> Signed-off-by: Casey Leedom 
> Signed-off-by: Arjun Vynipadath 
> Signed-off-by: Ganesh Goudar 

Applied, thanks.


Re: [PATCH 0/3] Remove IPVlan module dependencies on IPv6 and Netfilter

2018-02-16 Thread David Miller
From: Matteo Croce 
Date: Thu, 15 Feb 2018 15:04:52 +0100

> What about the other two, removing IPv6 and change the Kconfig?
> Other devices like VXLan, Geneve and VRF uses the same architecture
> to allow conditional compilation of the IPv6 module, I think that
> IPVlan should do the same.

Ok, those parts should be reasonable, please respin.


Re: [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic

2018-02-16 Thread David Miller
From: Rahul Lakkireddy 
Date: Thu, 15 Feb 2018 19:24:42 +0530

> Register callback to panic_notifier_list.  Invoke dump collect routine
> to append dump to vmcore.
> 
> Signed-off-by: Rahul Lakkireddy 
> Signed-off-by: Ganesh Goudar 

There is absolutely no precedent for a networking driver dumping
things into the vmcore image on a panic.

And I don't think this is a good idea.

Really, this commit message should have explained why this is desired
and in what context it is legitimate for this driver in particular
to do it.

A very detailed, long, complete commit message is especially important
when you are deciding to blaze your own trail and do something no other
networking driver has done before.

I get really upset when I see changes like this, because you give me
no preparation for what I'm about to read in the patch and therefore
I have to go into this routine asking you to explain things properly.

But as-is, I see this panic notifier as a really bad idea.

Thanks.



Re: [PATCH net-next 0/2] net: dsa: mv88e6xxx: Improve PTP access latency

2018-02-16 Thread David Miller
From: Andrew Lunn 
Date: Thu, 15 Feb 2018 14:38:33 +0100

> PTP needs to retrieve the hardware timestamps from the switch device
> in a low latency manor. However ethtool -S and bridge fdb show can
> hold the switch register access mutex for a long time. These patches
> changes the reading the statistics and the ATU so that the mutex is
> released and taken again between each statistic or ATU entry. The PTP
> code can then interleave its access to the hardware, keeping its
> latency low.

Series applied, thanks Andrew.


Re: [net-next v2 1/1] tipc: avoid unnecessary copying of bundled messages

2018-02-16 Thread David Miller
From: Jon Maloy 
Date: Thu, 15 Feb 2018 14:14:37 +0100

> A received sk buffer may contain dozens of smaller 'bundled' messages
> which after extraction go each in their own direction.
> 
> Unfortunately, when we extract those messages using skb_clone() each
> of the extracted buffers inherit the truesize value of the original
> buffer. Apart from causing massive overaccounting of the base buffer's
> memory, this often causes tipc_msg_validate() to come to the false
> conclusion that the ratio truesize/datasize > 4, and perform an
> unnecessary copying of the extracted buffer.
> 
> We now fix this problem by explicitly correcting the truesize value of
> the buffer clones to be the truesize of the clone itself plus a
> calculated fraction of the base buffer's overhead. This change
> eliminates the overaccounting and at least mitigates the occurrence
> of unnecessary buffer copying.
> 
> Reported-by: Hoang Le 
> Acked-by: Ying Xue 
> Signed-off-by: Jon Maloy 

As I explained in my previous two emails, I don't think this method of
accounting is appropriate.

All of your clones must use the same skb->truesize as the original SKB
because each and every one of them keeps the full buffer from being
liberated until they are released.


Re: [PATCH net] cxgb4: fix trailing zero in CIM LA dump

2018-02-16 Thread David Miller
From: Rahul Lakkireddy 
Date: Thu, 15 Feb 2018 18:20:01 +0530

> Set correct size of the CIM LA dump for T6.
> 
> Fixes: 27887bc7cb7f ("cxgb4: collect hardware LA dumps")
> Signed-off-by: Rahul Lakkireddy 
> Signed-off-by: Ganesh Goudar 

Applied and queued up for -stable, thanks.


Re: [PATCH net] cxgb4: free up resources of pf 0-3

2018-02-16 Thread David Miller
From: Ganesh Goudar 
Date: Thu, 15 Feb 2018 18:16:57 +0530

> free pf 0-3 resources, commit baf5086840ab ("cxgb4:
> restructure VF mgmt code") erroneously removed the
> code which frees the pf 0-3 resources, causing the
> probe of pf 0-3 to fail in case of driver reload.
> 
> Fixes: baf5086840ab ("cxgb4: restructure VF mgmt code")
> Signed-off-by: Ganesh Goudar 

Series applied, thank you.


Re: [net-next 00/10] tipc: de-generealize topology server

2018-02-16 Thread David Miller
From: Jon Maloy 
Date: Thu, 15 Feb 2018 10:40:41 +0100

> The topology server is partially based on a template that is much
> more generic than what we need. This results in a code that is
> unnecessarily hard to follow and keep bug free.
> 
> We now take the consequence of the fact that we only have one such
> server in TIPC, - with no prospects for introducing any more, and
> adapt the code to the specialized task it really is doing.

Nice cleanup, series applied, thanks Jon.


Re: [net-next 1/1] tipc: avoid unnecessary copying of bundled messages

2018-02-16 Thread David Miller
From: Jon Maloy 
Date: Thu, 15 Feb 2018 08:57:14 +

> The buffers we are cloning are linearized 1 MTU incoming
> buffers. There are no fragments.  Each clone normally points to only
> a tiny fraction of the data area of the base buffer.  I don't claim
> that copying always is bad, but in this case it happens in the
> majority of cases, and as I see it completely unnecessarily.
> 
> There is actually some under accounting, however, since we now only
> count the data area of the base buffer (== the sum of the data area
> of the clones) plus the overhead of the clones.  A more accurate
> calculation, taking into account even the overhead of the base
> buffer, would look like this: (*iskb)->truesize = SKB_TRUESIZE(imsz)
> + (skb->truesize - skb->len) / msg_msgcnt(msg);
> 
> I.e., we calculate the overhead of the base buffer and divide it
> equally among the clones.  Now I really can't see we are missing
> anything.

Maybe there is some miscommunication between us.  Let me try using
different words.

If you have a single SKB which has a truesize of, say, 10K and then you
clone this several times to point the SKB into small windows into that
original SKB's buffer, then you must use the same 10K truesize for the
clones that you did for the original SKB.

It absolutely does not matter that the clones are only pointing to a
small part of the original SKB's buffer.  Until they are freed up each
individual SKB still causes that _ENTIRE_ buffer to be held up and not
freeable.

This is why you have to be careful how you manage SKBs, and in fact
what you are doing by cloning and constricting the data pointers, is
actually troublesome and you can see this with the truesize problems
you are running into.

SKBs are really not built to be managed like this.


Re: [PATCH net v2] fib_semantics: Don't match route with mismatching tclassid

2018-02-16 Thread David Miller
From: Stefano Brivio 
Date: Thu, 15 Feb 2018 09:46:03 +0100

> In fib_nh_match(), if output interface or gateway are passed in
> the FIB configuration, we don't have to check next hops of
> multipath routes to conclude whether we have a match or not.
> 
> However, we might still have routes with different realms
> matching the same output interface and gateway configuration,
> and this needs to cause the match to fail. Otherwise the first
> route inserted in the FIB will match, regardless of the realms:
> 
>  # ip route add 1.1.1.1 dev eth0 table 1234 realms 1/2
>  # ip route append 1.1.1.1 dev eth0 table 1234 realms 3/4
>  # ip route list table 1234
>  1.1.1.1 dev eth0 scope link realms 1/2
>  1.1.1.1 dev eth0 scope link realms 3/4
>  # ip route del 1.1.1.1 dev eth0 table 1234 realms 3/4
>  # ip route list table 1234
>  1.1.1.1 dev eth0 scope link realms 3/4
> 
> whereas route with realms 3/4 should have been deleted instead.
> 
> Explicitly check for fc_flow passed in the FIB configuration
> (this comes from RTA_FLOW extracted by rtm_to_fib_config()) and
> fail matching if it differs from nh_tclassid.
> 
> The handling of RTA_FLOW for multipath routes later in
> fib_nh_match() is still needed, as we can have multiple RTA_FLOW
> attributes that need to be matched against the tclassid of each
> next hop.
> 
> v2: Check that fc_flow is set before discarding the match, so
> that the user can still select the first matching rule by
> not specifying any realm, as suggested by David Ahern.
> 
> Reported-by: Jianlin Shi 
> Signed-off-by: Stefano Brivio 

Applied and queued up for -stable, thanks.



Re: [PATCH V6 2/4] sctp: Add ip option support

2018-02-16 Thread Neil Horman
On Fri, Feb 16, 2018 at 10:56:07AM -0200, Marcelo Ricardo Leitner wrote:
> On Thu, Feb 15, 2018 at 09:15:40AM -0500, Neil Horman wrote:
> > On Tue, Feb 13, 2018 at 08:54:44PM +, Richard Haines wrote:
> > > Add ip option support to allow LSM security modules to utilise CIPSO/IPv4
> > > and CALIPSO/IPv6 services.
> > > 
> > > Signed-off-by: Richard Haines 
> > > ---
> > >  include/net/sctp/sctp.h|  4 +++-
> > >  include/net/sctp/structs.h |  2 ++
> > >  net/sctp/chunk.c   | 12 +++-
> > >  net/sctp/ipv6.c| 42 
> > > +++---
> > >  net/sctp/output.c  |  5 -
> > >  net/sctp/protocol.c| 36 
> > >  net/sctp/socket.c  | 14 ++
> > >  7 files changed, 97 insertions(+), 18 deletions(-)
> > > 
> > > diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> > > index f7ae6b0..25c5c87 100644
> > > --- a/include/net/sctp/sctp.h
> > > +++ b/include/net/sctp/sctp.h
> > > @@ -441,9 +441,11 @@ static inline int sctp_list_single_entry(struct 
> > > list_head *head)
> > >  static inline int sctp_frag_point(const struct sctp_association *asoc, 
> > > int pmtu)
> > >  {
> > >   struct sctp_sock *sp = sctp_sk(asoc->base.sk);
> > > + struct sctp_af *af = sp->pf->af;
> > >   int frag = pmtu;
> > >  
> > > - frag -= sp->pf->af->net_header_len;
> > > + frag -= af->ip_options_len(asoc->base.sk);
> > > + frag -= af->net_header_len;
> > > >   frag -= sizeof(struct sctphdr) + sctp_datachk_len(&asoc->stream);
> > >  
> > >   if (asoc->user_frag)
> > > diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> > > index 03e92dd..ead5fce 100644
> > > --- a/include/net/sctp/structs.h
> > > +++ b/include/net/sctp/structs.h
> > > @@ -491,6 +491,7 @@ struct sctp_af {
> > >   void(*ecn_capable)(struct sock *sk);
> > >   __u16   net_header_len;
> > >   int sockaddr_len;
> > > + int (*ip_options_len)(struct sock *sk);
> > >   sa_family_t sa_family;
> > >   struct list_head list;
> > >  };
> > > @@ -515,6 +516,7 @@ struct sctp_pf {
> > >   int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr);
> > >   void (*to_sk_saddr)(union sctp_addr *, struct sock *sk);
> > >   void (*to_sk_daddr)(union sctp_addr *, struct sock *sk);
> > > + void (*copy_ip_options)(struct sock *sk, struct sock *newsk);
> > >   struct sctp_af *af;
> > >  };
> > >  
> > > diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> > > index 991a530..d5c0ef7 100644
> > > --- a/net/sctp/chunk.c
> > > +++ b/net/sctp/chunk.c
> > > @@ -154,7 +154,6 @@ static void sctp_datamsg_assign(struct sctp_datamsg 
> > > *msg, struct sctp_chunk *chu
> > >   chunk->msg = msg;
> > >  }
> > >  
> > > -
> > >  /* A data chunk can have a maximum payload of (2^16 - 20).  Break
> > >   * down any such message into smaller chunks.  Opportunistically, 
> > > fragment
> > >   * the chunks down to the current MTU constraints.  We may get 
> > > refragmented
> > > @@ -171,6 +170,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
> > > sctp_association *asoc,
> > >   struct list_head *pos, *temp;
> > >   struct sctp_chunk *chunk;
> > >   struct sctp_datamsg *msg;
> > > + struct sctp_sock *sp;
> > > + struct sctp_af *af;
> > >   int err;
> > >  
> > >   msg = sctp_datamsg_new(GFP_KERNEL);
> > > @@ -189,9 +190,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
> > > sctp_association *asoc,
> > >   /* This is the biggest possible DATA chunk that can fit into
> > >* the packet
> > >*/
> > > - max_data = asoc->pathmtu -
> > > -sctp_sk(asoc->base.sk)->pf->af->net_header_len -
> > > -sizeof(struct sctphdr) - sctp_datachk_len(>stream);
> > > + sp = sctp_sk(asoc->base.sk);
> > > + af = sp->pf->af;
> > > > + max_data = asoc->pathmtu - af->net_header_len -
> > > > +sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream) -
> > > > +af->ip_options_len(asoc->base.sk);
> > >   max_data = SCTP_TRUNC4(max_data);
> > >  
> > >   /* If the the peer requested that we authenticate DATA chunks
> > > @@ -211,7 +214,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
> > > sctp_association *asoc,
> > >  
> > >   /* Set first_len and then account for possible bundles on first frag */
> > >   first_len = max_data;
> > > -
> > >   /* Check to see if we have a pending SACK and try to let it be bundled
> > >* with this message.  Do this if we don't have any data queued already.
> > >* To check that, look at out_qlen and retransmit list.
> > > diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
> > > index e35d4f7..0b0f895 100644
> > > --- a/net/sctp/ipv6.c
> > > +++ b/net/sctp/ipv6.c
> > > @@ -427,6 +427,38 @@ static void sctp_v6_copy_addrlist(struct list_head 
> > > *addrlist,
> > >   rcu_read_unlock();
> > >  }
> > >  
> > > +/* Copy over any ip options */
> > > +static void sctp_v6_copy_ip_options(struct sock *sk, struct sock 

Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Oleksandr Natalenko
Hi.

On pátek 16. února 2018 18:56:12 CET Holger Hoffstätte wrote:
> There is simply no reason why you shouldn't get approx. line rate
> (~920+-ish) Mbit over wired 1GBit Ethernet; even my broken 10-year old
> Core2Duo laptop can do that. Can you boot with spectre_v2=off and try "the
> simplest case" with the defaults cubic/pfifo_fast? spectre_v2 has terrible
> performance impact esp. on small/older processors.

Just have tried. No visible difference.

> When I last benchmarked full PREEMPT with 4.9.x it was similarly bad and
> also had a noticeable network throughput impact even on my i7.
> 
> Also congratulations for being the only other person I know who ever tried
> YeAH. :-)

Well, according to the git log on tcp_yeah.c and Reported-by tag, I was not 
the only one there ;).

Regards,
  Oleksandr




Re: [PATCH v2] ravb: add support for changing MTU

2018-02-16 Thread Florian Fainelli
On 02/16/2018 11:43 AM, Sergei Shtylyov wrote:
> Hello!
> 
> On 02/16/2018 10:42 PM, Florian Fainelli wrote:
> 
>>> Allow for changing the MTU within the limit of the maximum size of a
>>> descriptor (2048 bytes). Add the callback to change MTU from user-space
>>> and take the configurable MTU into account when configuring the
>>> hardware.
>>>
>>> Signed-off-by: Niklas Söderlund 
>>> ---
>>
>>>  
>>> +static int ravb_change_mtu(struct net_device *ndev, int new_mtu)
>>> +{
>>> +   if (netif_running(ndev))
>>> +   return -EBUSY;
>>> +
>>> +   ndev->mtu = new_mtu;
>>> +   netdev_update_features(ndev);
>>
>> Don't you somehow need to quiesce the RX DMA and make sure you that you
>> re-allocate all RX buffers within priv->rx_skb[q][entry] such that they
>> will be able to accept a larger buffer size?
>>
>> If we put the ravb interface under high RX load and we change the MTU on
>> the fly, can we crash the kernel?
> 
>I thought the netif_running() check guards against that. Does it not?

It does, completely missed that, thanks!
-- 
Florian


Re: [PATCH v2] ravb: add support for changing MTU

2018-02-16 Thread Sergei Shtylyov
Hello!

On 02/16/2018 10:42 PM, Florian Fainelli wrote:

>> Allow for changing the MTU within the limit of the maximum size of a
>> descriptor (2048 bytes). Add the callback to change MTU from user-space
>> and take the configurable MTU into account when configuring the
>> hardware.
>>
>> Signed-off-by: Niklas Söderlund 
>> ---
> 
>>  
>> +static int ravb_change_mtu(struct net_device *ndev, int new_mtu)
>> +{
>> +if (netif_running(ndev))
>> +return -EBUSY;
>> +
>> +ndev->mtu = new_mtu;
>> +netdev_update_features(ndev);
> 
> Don't you somehow need to quiesce the RX DMA and make sure you that you
> re-allocate all RX buffers within priv->rx_skb[q][entry] such that they
> will be able to accept a larger buffer size?
> 
> If we put the ravb interface under high RX load and we change the MTU on
> the fly, can we crash the kernel?

   I thought the netif_running() check guards against that. Does it not?
 
MBR, Sergei


Re: [PATCH v2] ravb: add support for changing MTU

2018-02-16 Thread Florian Fainelli
On 02/16/2018 08:10 AM, Niklas Söderlund wrote:
> Allow for changing the MTU within the limit of the maximum size of a
> descriptor (2048 bytes). Add the callback to change MTU from user-space
> and take the configurable MTU into account when configuring the
> hardware.
> 
> Signed-off-by: Niklas Söderlund 
> ---

>  
> +static int ravb_change_mtu(struct net_device *ndev, int new_mtu)
> +{
> + if (netif_running(ndev))
> + return -EBUSY;
> +
> + ndev->mtu = new_mtu;
> + netdev_update_features(ndev);

Don't you somehow need to quiesce the RX DMA and make sure you that you
re-allocate all RX buffers within priv->rx_skb[q][entry] such that they
will be able to accept a larger buffer size?

If we put the ravb interface under high RX load and we change the MTU on
the fly, can we crash the kernel?
-- 
Florian


[PATCH net-next] net: Only honor ifindex in IP_PKTINFO if non-0

2018-02-16 Thread David Ahern
Only allow ifindex from IP_PKTINFO to override SO_BINDTODEVICE settings
if the index is actually set in the message.

Signed-off-by: David Ahern 
---
 net/ipv4/ip_sockglue.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 008be04ac1cc..9dca0fb8c482 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -258,7 +258,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, 
struct ipcm_cookie *ipc,
src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
return -EINVAL;
-   ipc->oif = src_info->ipi6_ifindex;
+   if (src_info->ipi6_ifindex)
+   ipc->oif = src_info->ipi6_ifindex;
ipc->addr = src_info->ipi6_addr.s6_addr32[3];
continue;
}
@@ -288,7 +289,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, 
struct ipcm_cookie *ipc,
if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct 
in_pktinfo)))
return -EINVAL;
info = (struct in_pktinfo *)CMSG_DATA(cmsg);
-   ipc->oif = info->ipi_ifindex;
+   if (info->ipi_ifindex)
+   ipc->oif = info->ipi_ifindex;
ipc->addr = info->ipi_spec_dst.s_addr;
break;
}
-- 
2.11.0



Re: ppp/pppoe, still panic 4.15.3 in ppp_push

2018-02-16 Thread Guillaume Nault
On Fri, Feb 16, 2018 at 01:13:18PM +0200, Denys Fedoryshchenko wrote:
> On 2018-02-15 21:42, Guillaume Nault wrote:
> > On Thu, Feb 15, 2018 at 09:34:42PM +0200, Denys Fedoryshchenko wrote:
> > > On 2018-02-15 21:31, Guillaume Nault wrote:
> > > > On Thu, Feb 15, 2018 at 06:01:16PM +0200, Denys Fedoryshchenko wrote:
> > > > > On 2018-02-15 17:55, Guillaume Nault wrote:
> > > > > > On Thu, Feb 15, 2018 at 12:19:52PM +0200, Denys Fedoryshchenko 
> > > > > > wrote:
> > > > > > > Here we go:
> > > > > > >
> > > > > > >   [24558.921549]
> > > > > > > ==
> > > > > > >   [24558.922167] BUG: KASAN: use-after-free in
> > > > > > > ppp_ioctl+0xa6a/0x1522
> > > > > > > [ppp_generic]
> > > > > > >   [24558.922776] Write of size 8 at addr 8803d35bf3f8 by 
> > > > > > > task
> > > > > > > accel-pppd/12622
> > > > > > >   [24558.923113]
> > > > > > >   [24558.923451] CPU: 0 PID: 12622 Comm: accel-pppd Tainted: 
> > > > > > > G
> > > > > > > W
> > > > > > > 4.15.3-build-0134 #1
> > > > > > >   [24558.924058] Hardware name: HP ProLiant DL320e Gen8 v2,
> > > > > > > BIOS P80
> > > > > > > 04/02/2015
> > > > > > >   [24558.924406] Call Trace:
> > > > > > >   [24558.924753]  dump_stack+0x46/0x59
> > > > > > >   [24558.925103]  print_address_description+0x6b/0x23b
> > > > > > >   [24558.925451]  ? ppp_ioctl+0xa6a/0x1522 [ppp_generic]
> > > > > > >   [24558.925797]  kasan_report+0x21b/0x241
> > > > > > >   [24558.926136]  ppp_ioctl+0xa6a/0x1522 [ppp_generic]
> > > > > > >   [24558.926479]  ? ppp_nl_newlink+0x1da/0x1da [ppp_generic]
> > > > > > >   [24558.926829]  ? sock_sendmsg+0x89/0x99
> > > > > > >   [24558.927176]  ? __vfs_write+0xd9/0x4ad
> > > > > > >   [24558.927523]  ? kernel_read+0xed/0xed
> > > > > > >   [24558.927872]  ? SyS_getpeername+0x18c/0x18c
> > > > > > >   [24558.928213]  ? bit_waitqueue+0x2a/0x2a
> > > > > > >   [24558.928561]  ? wake_atomic_t_function+0x115/0x115
> > > > > > >   [24558.928898]  vfs_ioctl+0x6e/0x81
> > > > > > >   [24558.929228]  do_vfs_ioctl+0xa00/0xb10
> > > > > > >   [24558.929571]  ? sigprocmask+0x1a6/0x1d0
> > > > > > >   [24558.929907]  ? sigsuspend+0x13e/0x13e
> > > > > > >   [24558.930239]  ? ioctl_preallocate+0x14e/0x14e
> > > > > > >   [24558.930568]  ? SyS_rt_sigprocmask+0xf1/0x142
> > > > > > >   [24558.930904]  ? sigprocmask+0x1d0/0x1d0
> > > > > > >   [24558.931252]  SyS_ioctl+0x39/0x55
> > > > > > >   [24558.931595]  ? do_vfs_ioctl+0xb10/0xb10
> > > > > > >   [24558.931942]  do_syscall_64+0x1b1/0x31f
> > > > > > >   [24558.932288]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> > > > > > >   [24558.932627] RIP: 0033:0x7f302849d8a7
> > > > > > >   [24558.932965] RSP: 002b:7f3029a52af8 EFLAGS: 0206
> > > > > > > ORIG_RAX:
> > > > > > > 0010
> > > > > > >   [24558.933578] RAX: ffda RBX: 7f3027d861e3 
> > > > > > > RCX:
> > > > > > > 7f302849d8a7
> > > > > > >   [24558.933927] RDX: 7f3023f49468 RSI: 4004743a 
> > > > > > > RDI:
> > > > > > > 3a67
> > > > > > >   [24558.934266] RBP: 7f3029a52b20 R08:  
> > > > > > > R09:
> > > > > > > 55c8308d8e40
> > > > > > >   [24558.934607] R10: 0008 R11: 0206 
> > > > > > > R12:
> > > > > > > 7f3023f49358
> > > > > > >   [24558.934947] R13: 7ffe86e5723f R14:  
> > > > > > > R15:
> > > > > > > 7f3029a53700
> > > > > > >   [24558.935288]
> > > > > > >   [24558.935626] Allocated by task 12622:
> > > > > > >   [24558.935972]  ppp_register_net_channel+0x5f/0x5c6
> > > > > > > [ppp_generic]
> > > > > > >   [24558.936306]  pppoe_connect+0xab7/0xc71 [pppoe]
> > > > > > >   [24558.936640]  SyS_connect+0x14b/0x1b7
> > > > > > >   [24558.936975]  do_syscall_64+0x1b1/0x31f
> > > > > > >   [24558.937319]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> > > > > > >   [24558.937655]
> > > > > > >   [24558.937993] Freed by task 12622:
> > > > > > >   [24558.938321]  kfree+0xb0/0x11d
> > > > > > >   [24558.938658]  ppp_release+0x111/0x120 [ppp_generic]
> > > > > > >   [24558.938994]  __fput+0x2ba/0x51a
> > > > > > >   [24558.939332]  task_work_run+0x11c/0x13d
> > > > > > >   [24558.939676]  exit_to_usermode_loop+0x7c/0xaf
> > > > > > >   [24558.940022]  do_syscall_64+0x2ea/0x31f
> > > > > > >   [24558.940368]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> > > > > > >   [24558.947099]
> > > > > >
> > > > > > Your first guess was right. It looks like we have an issue with
> > > > > > reference counting on the channels. Can you send me your 
> > > > > > ppp_generic.o?
> > > > > http://nuclearcat.com/ppp_generic.o
> > > > > Compiled with gcc version 6.4.0 (Gentoo 6.4.0-r1 p1.3)
> > > > >
> > > > From what I can see, ppp_release() and ioctl(PPPIOCCONNECT) are called
> > > > concurrently on the same ppp_file. Even if this ppp_file was pointed at
> > > > by two different file descriptors, I can't see how this could defeat
> > > > the reference counting mechanism. I'm going 

Re: [PATCH net-next 0/3] eBPF Seccomp filters

2018-02-16 Thread Sargun Dhillon
On Wed, Feb 14, 2018 at 8:30 PM, Alexei Starovoitov
 wrote:
> On Wed, Feb 14, 2018 at 10:32:22AM -0700, Tycho Andersen wrote:
>> > >
>> > > What's the reason for adding eBPF support? seccomp shouldn't need it,
>> > > and it only makes the code more complex. I'd rather stick with cBPF
>> > > until we have an overwhelmingly good reason to use eBPF as a "native"
>> > > seccomp filter language.
>> > >
>> >
>> > I can think of two fairly strong use cases for eBPF's ability to call
>> > functions: logging and Tycho's user notifier thing.
>>
>> Worth noting that there is one additional thing that I didn't
>> implement, but which would be nice and is probably not possible with
>> eBPF (at least, not without a bunch of additional infrastructure):
>> passing fds back to the tracee from the manager if you intercept
>> socket(), or accept() or something.
>>
>> This could again be accomplished via other means, though it would be a
>> lot nicer to have a primitive for it.
>
> there is bpf_perf_event_output() interface that allows to stream
> arbitrary data from kernel into user space via perf ring buffer.
> User space can epoll on it. We use this in both tracing and networking
> for notifications and streaming data transfers.
> I suspect this can be used for 'logging' too, since it's cheap and fast.
>
> Specifically for android we added bpf_lsm hooks, cookie/uid helpers,
> and read-only maps.
> Lorenzo,
> there was a claim in this thread that bpf is disabled on android.
> Can you please clarify ?
> If it's actually disabled and there is no intent to enable it,
> I'd rather not add any more android specific features to bpf.
>
> What I think is important to understand is that BPF goes through
> very active development. The verifier is constantly getting smarter.
> There is work to add bounded loops, lock/unlock, get/put tracking,
> global/percpu variables, dynamic linking and so on.
> Most of the features are available to root only and unpriv
> has very limited set. Like getting bpf_perf_event_output() to work
> for unpriv will likely require additional verifier work.
>
> So all cool bits will not be usable by seccomp+eBPF and unpriv
> on day one. It's not a lot of work either, but once it's done
> I'd hate to see arguments against adding more verifier features
> just because eBPF is used by seccomp/landlock/other_security_thing.
>
> Also I think the argument that seccomp+eBPF will be faster than
> seccomp+cBPF is a weak one. I bet kpti on/off makes no difference
> under seccomp, since _all_ syscalls are already slow for sandboxed app.
> Instead of making seccomp 5% faster with eBPF, I think it's
> worth looking into extending LSM hooks to cover all syscalls and
> have programmable (bpf or whatever) filtering applied per syscall.
> Like we can have a white list syscall table covered by lsm hooks
> and any other syscall will get into old seccomp-style
> filtering category automatically.
> lsm+bpf would need to follow process hierarchy. It shouldn't be
> a runtime check at syscall entry either, but compile time
> extra branch in SYSCALL_DEFINE for non-whitelisted syscalls.
> There are bunch of other things to figure out, but I think
> the perf win will be bigger than replacing cBPF with eBPF in
> existing seccomp.
>
Given this test program:
for (i = 10; i < ; i++) syscall(__NR_getpid);

If I implement an eBPF filter with PROG_ARRAYs, and tail call, the
numbers are such:
ebpf JIT 12.3% slower than native
ebpf no JIT 13.6% slower than native
seccomp JIT 17.6% slower than native
seccomp no JIT 37% slower than native

This is using libseccomp for the standard seccomp BPF program. There's
no reasonable way for our workload to know which syscalls come
"earlier", so we can't take that optimization. Potentially, libseccomp
can be smarter about ordering cases (using ranges), and use an
O(log(n)) search algorithm, but both of these are microptimizations
that scale with the number of syscalls and per-syscall rules. The
nicety of using a PROG_ARRAY means that adding additional filters
(syscalls) comes at no cost, whereas there's a tradeoff any time you
add another rule in traditional seccomp filters.

This was tested on an Amazon M4.16XL running with pcid, and KPTI.


Re: [PATCH v3 2/2] net: ethernet: nixge: Add support for National Instruments XGE netdev

2018-02-16 Thread Andrew Lunn
On Fri, Feb 16, 2018 at 09:00:33AM -0800, Moritz Fischer wrote:
> +#define NIXGE_MDIO_CLAUSE45  BIT(12)
> +#define NIXGE_MDIO_CLAUSE22  0
> +#define NIXGE_MDIO_OP(n) (((n) & 0x3) << 10)
> +#define NIXGE_MDIO_OP_ADDRESS0
> +#define NIXGE_MDIO_OP_WRITE  BIT(0)
> +#define NIXGE_MDIO_OP_READ   (BIT(1) | BIT(0))
> +#define MDIO_C22_WRITE   BIT(0)
> +#define MDIO_C22_READBIT(1)
> +#define MDIO_READ_POST   2

Hi Moritz


NIXGE prefix?

> +#define NIXGE_MDIO_ADDR(n)   (((n) & 0x1f) << 5)
> +#define NIXGE_MDIO_MMD(n)(((n) & 0x1f) << 0)
> +
> +#define NIXGE_MAX_PHY_ADDR   32

This is nothing specific to this device. Please use the standard
PHY_MAX_ADDR.

> +static int nixge_open(struct net_device *ndev)
> +{
> + struct nixge_priv *priv = netdev_priv(ndev);
> + struct phy_device *phy;
> + int ret;
> +
> + nixge_device_reset(ndev);
> +
> + phy = of_phy_connect(ndev, priv->phy_node,
> +  &nixge_handle_link_change, 0, priv->phy_mode);
> + if (!phy)
> + return -ENODEV;
> +
> + phy_start(phy);
> +
> + /* Enable tasklets for Axi DMA error handling */
> + tasklet_init(&priv->dma_err_tasklet, nixge_dma_err_handler,
> +  (unsigned long)priv);
> +
> + napi_enable(&priv->napi);
> +
> + /* Enable interrupts for Axi DMA Tx */
> + ret = request_irq(priv->tx_irq, nixge_tx_irq, 0, ndev->name, ndev);
> + if (ret)
> + goto err_tx_irq;
> + /* Enable interrupts for Axi DMA Rx */
> + ret = request_irq(priv->rx_irq, nixge_rx_irq, 0, ndev->name, ndev);
> + if (ret)
> + goto err_rx_irq;
> +
> + netif_start_queue(ndev);
> +
> + return 0;
> +
> +err_rx_irq:
> + free_irq(priv->tx_irq, ndev);
> +err_tx_irq:
> + tasklet_kill(&priv->dma_err_tasklet);
> + netdev_err(ndev, "request_irq() failed\n");
> + return ret;

Maybe you also want to stop the phy and disconnect it?

> +static int nixge_stop(struct net_device *ndev)
> +{
> + struct nixge_priv *priv = netdev_priv(ndev);
> + u32 cr;
> +
> + netif_stop_queue(ndev);
> + napi_disable(&priv->napi);
> +
> + if (ndev->phydev)
> + phy_stop(ndev->phydev);

phy_disconnect() ?


> + cr = nixge_dma_read_reg(priv, XAXIDMA_RX_CR_OFFSET);
> + nixge_dma_write_reg(priv, XAXIDMA_RX_CR_OFFSET,
> + cr & (~XAXIDMA_CR_RUNSTOP_MASK));
> + cr = nixge_dma_read_reg(priv, XAXIDMA_TX_CR_OFFSET);
> + nixge_dma_write_reg(priv, XAXIDMA_TX_CR_OFFSET,
> + cr & (~XAXIDMA_CR_RUNSTOP_MASK));
> +
> + tasklet_kill(&priv->dma_err_tasklet);
> +
> + free_irq(priv->tx_irq, ndev);
> + free_irq(priv->rx_irq, ndev);
> +
> + nixge_hw_dma_bd_release(ndev);
> +
> + return 0;
> +}

> +static int nixge_mdio_read(struct mii_bus *bus, int phy_id, int reg)
> +{
> + struct nixge_priv *priv = bus->priv;
> + u32 status, tmp;
> + int err;
> + u16 device;
> +
> + if (reg & MII_ADDR_C45) {
> + device = (reg >> 16) & 0x1f;
> +
> + nixge_ctrl_write_reg(priv, NIXGE_REG_MDIO_ADDR, reg & 0xffff);
> +
> + tmp = NIXGE_MDIO_CLAUSE45 | NIXGE_MDIO_OP(NIXGE_MDIO_OP_ADDRESS)
> + | NIXGE_MDIO_ADDR(phy_id) | NIXGE_MDIO_MMD(device);
> +
> + nixge_ctrl_write_reg(priv, NIXGE_REG_MDIO_OP, tmp);
> + nixge_ctrl_write_reg(priv, NIXGE_REG_MDIO_CTRL, 1);
> +
> + err = nixge_ctrl_poll_timeout(priv, NIXGE_REG_MDIO_CTRL, status,
> +   !status, 10, 1000);
> + if (err) {
> + dev_err(priv->dev, "timeout setting address");
> + return err;
> + }
> +
> + tmp = NIXGE_MDIO_CLAUSE45 | NIXGE_MDIO_OP(NIXGE_MDIO_OP_READ) |
> + NIXGE_MDIO_ADDR(phy_id) | NIXGE_MDIO_MMD(device);
> + } else {
> + device = reg & 0x1f;
> +
> + tmp = NIXGE_MDIO_CLAUSE22 | NIXGE_MDIO_OP(MDIO_C22_READ) |
> + NIXGE_MDIO_ADDR(phy_id) | NIXGE_MDIO_MMD(device);

It would be nice to have some naming consistency here. NIXGE_MDIO_C45_READ and 
NIXGE_MDIO_C22_READ?

> + }
> +
> + nixge_ctrl_write_reg(priv, NIXGE_REG_MDIO_OP, tmp);
> + nixge_ctrl_write_reg(priv, NIXGE_REG_MDIO_CTRL, 1);
> +
> + err = nixge_ctrl_poll_timeout(priv, NIXGE_REG_MDIO_CTRL, status,
> +   !status, 10, 1000);
> + if (err) {
> + dev_err(priv->dev, "timeout setting read command");
> + return err;
> + }
> +
> + status = nixge_ctrl_read_reg(priv, NIXGE_REG_MDIO_DATA);
> +
> + return status;
> +}
> +
> +static int nixge_mdio_setup(struct nixge_priv *priv, struct device_node *np)
> +{
> + struct mii_bus *bus;
> + int err;
> +
> + bus = mdiobus_alloc();

devm_mdiobus_alloc() ?

> + if (!bus)
> + return -ENOMEM;
> +
> + snprintf(bus->id, MII_BUS_ID_SIZE, 

[RFC PATCH v3 3/3] virtio_net: Enable alternate datapath without creating an additional netdev

2018-02-16 Thread Sridhar Samudrala
This patch addresses the issues that were seen with the 3 netdev model by
avoiding the creation of an additional netdev. Instead the bypass state
information is tracked in the original netdev and a different set of
ndo_ops and ethtool_ops are used when BACKUP feature is enabled.

Signed-off-by: Sridhar Samudrala 
Reviewed-by: Alexander Duyck  
---
 drivers/net/virtio_net.c | 283 +--
 1 file changed, 101 insertions(+), 182 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 14679806c1b1..c85b2949f151 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -154,7 +154,7 @@ struct virtnet_bypass_info {
struct net_device __rcu *active_netdev;
 
/* virtio_net netdev */
-   struct net_device __rcu *backup_netdev;
+   struct net_device *backup_netdev;
 
/* active netdev stats */
struct rtnl_link_stats64 active_stats;
@@ -229,8 +229,8 @@ struct virtnet_info {
 
unsigned long guest_offloads;
 
-   /* upper netdev created when BACKUP feature enabled */
-   struct net_device *bypass_netdev;
+   /* bypass state maintained when BACKUP feature is enabled */
+   struct virtnet_bypass_info *vbi;
 };
 
 struct padded_vnet_hdr {
@@ -2285,6 +2285,22 @@ static bool virtnet_bypass_xmit_ready(struct net_device 
*dev)
return netif_running(dev) && netif_carrier_ok(dev);
 }
 
+static bool virtnet_bypass_active_ready(struct net_device *dev)
+{
+   struct virtnet_info *vi = netdev_priv(dev);
+   struct virtnet_bypass_info *vbi = vi->vbi;
+   struct net_device *active;
+
+   if (!vbi)
+   return false;
+
+   active = rcu_dereference(vbi->active_netdev);
+   if (!active || !virtnet_bypass_xmit_ready(active))
+   return false;
+
+   return true;
+}
+
 static void virtnet_config_changed_work(struct work_struct *work)
 {
struct virtnet_info *vi =
@@ -2312,7 +2328,7 @@ static void virtnet_config_changed_work(struct 
work_struct *work)
virtnet_update_settings(vi);
netif_carrier_on(vi->dev);
netif_tx_wake_all_queues(vi->dev);
-   } else {
+   } else if (!virtnet_bypass_active_ready(vi->dev)) {
netif_carrier_off(vi->dev);
netif_tx_stop_all_queues(vi->dev);
}
@@ -2501,7 +2517,8 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
 
if (vi->has_cvq) {
vi->cvq = vqs[total_vqs - 1];
-   if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
+   if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN) &&
+   !virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
}
 
@@ -2690,62 +2707,54 @@ virtnet_bypass_child_open(struct net_device *dev,
 
 static int virtnet_bypass_open(struct net_device *dev)
 {
-   struct virtnet_bypass_info *vbi = netdev_priv(dev);
+   struct virtnet_info *vi = netdev_priv(dev);
+   struct virtnet_bypass_info *vbi = vi->vbi;
struct net_device *child_netdev;
-
-   netif_carrier_off(dev);
-   netif_tx_wake_all_queues(dev);
+   int err;
 
child_netdev = rtnl_dereference(vbi->active_netdev);
if (child_netdev)
virtnet_bypass_child_open(dev, child_netdev);
 
-   child_netdev = rtnl_dereference(vbi->backup_netdev);
-   if (child_netdev)
-   virtnet_bypass_child_open(dev, child_netdev);
+   err = virtnet_open(dev);
+   if (err < 0) {
+   dev_close(child_netdev);
+   return err;
+   }
 
return 0;
 }
 
 static int virtnet_bypass_close(struct net_device *dev)
 {
-   struct virtnet_bypass_info *vi = netdev_priv(dev);
+   struct virtnet_info *vi = netdev_priv(dev);
+   struct virtnet_bypass_info *vbi = vi->vbi;
struct net_device *child_netdev;
 
-   netif_tx_disable(dev);
+   virtnet_close(dev);
 
-   child_netdev = rtnl_dereference(vi->active_netdev);
-   if (child_netdev)
-   dev_close(child_netdev);
+   if (!vbi)
+   goto done;
 
-   child_netdev = rtnl_dereference(vi->backup_netdev);
+   child_netdev = rtnl_dereference(vbi->active_netdev);
if (child_netdev)
dev_close(child_netdev);
 
+done:
return 0;
 }
 
-static netdev_tx_t
-virtnet_bypass_drop_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-   atomic_long_inc(&dev->tx_dropped);
-   dev_kfree_skb_any(skb);
-   return NETDEV_TX_OK;
-}
-
 static netdev_tx_t
 virtnet_bypass_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-   struct virtnet_bypass_info *vbi = netdev_priv(dev);
+   struct virtnet_info *vi = netdev_priv(dev);
+   struct virtnet_bypass_info *vbi = vi->vbi;
struct net_device 

[RFC PATCH v3 2/3] virtio_net: Extend virtio to use VF datapath when available

2018-02-16 Thread Sridhar Samudrala
This patch enables virtio_net to switch over to a VF datapath when a VF
netdev is present with the same MAC address. It allows live migration
of a VM with a direct attached VF without the need to setup a bond/team
between a VF and virtio net device in the guest.

The hypervisor needs to enable only one datapath at any time so that
packets don't get looped back to the VM over the other datapath. When a VF
is plugged, the virtio datapath link state can be marked as down. The
hypervisor needs to unplug the VF device from the guest on the source host
and reset the MAC filter of the VF to initiate failover of datapath to
virtio before starting the migration. After the migration is completed,
the destination hypervisor sets the MAC filter on the VF and plugs it back
to the guest to switch over to VF datapath.

When BACKUP feature is enabled, an additional netdev(bypass netdev) is
created that acts as a master device and tracks the state of the 2 lower
netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
passthru device with the same MAC is registered as 'active' netdev.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

Signed-off-by: Sridhar Samudrala 
Signed-off-by: Alexander Duyck  
---
 drivers/net/virtio_net.c | 639 ++-
 1 file changed, 638 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index bcd13fe906ca..14679806c1b1 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -147,6 +148,27 @@ struct receive_queue {
struct xdp_rxq_info xdp_rxq;
 };
 
+/* bypass state maintained when BACKUP feature is enabled */
+struct virtnet_bypass_info {
+   /* passthru netdev with same MAC */
+   struct net_device __rcu *active_netdev;
+
+   /* virtio_net netdev */
+   struct net_device __rcu *backup_netdev;
+
+   /* active netdev stats */
+   struct rtnl_link_stats64 active_stats;
+
+   /* backup netdev stats */
+   struct rtnl_link_stats64 backup_stats;
+
+   /* aggregated stats */
+   struct rtnl_link_stats64 bypass_stats;
+
+   /* spinlock while updating stats */
+   spinlock_t stats_lock;
+};
+
 struct virtnet_info {
struct virtio_device *vdev;
struct virtqueue *cvq;
@@ -206,6 +228,9 @@ struct virtnet_info {
u32 speed;
 
unsigned long guest_offloads;
+
+   /* upper netdev created when BACKUP feature enabled */
+   struct net_device *bypass_netdev;
 };
 
 struct padded_vnet_hdr {
@@ -2255,6 +2280,11 @@ static const struct net_device_ops virtnet_netdev = {
.ndo_features_check = passthru_features_check,
 };
 
+static bool virtnet_bypass_xmit_ready(struct net_device *dev)
+{
+   return netif_running(dev) && netif_carrier_ok(dev);
+}
+
 static void virtnet_config_changed_work(struct work_struct *work)
 {
struct virtnet_info *vi =
@@ -2647,6 +2677,601 @@ static int virtnet_validate(struct virtio_device *vdev)
return 0;
 }
 
+static void
+virtnet_bypass_child_open(struct net_device *dev,
+ struct net_device *child_netdev)
+{
+   int err = dev_open(child_netdev);
+
+   if (err)
+   netdev_warn(dev, "unable to open slave: %s: %d\n",
+   child_netdev->name, err);
+}
+
+static int virtnet_bypass_open(struct net_device *dev)
+{
+   struct virtnet_bypass_info *vbi = netdev_priv(dev);
+   struct net_device *child_netdev;
+
+   netif_carrier_off(dev);
+   netif_tx_wake_all_queues(dev);
+
+   child_netdev = rtnl_dereference(vbi->active_netdev);
+   if (child_netdev)
+   virtnet_bypass_child_open(dev, child_netdev);
+
+   child_netdev = rtnl_dereference(vbi->backup_netdev);
+   if (child_netdev)
+   virtnet_bypass_child_open(dev, child_netdev);
+
+   return 0;
+}
+
+static int virtnet_bypass_close(struct net_device *dev)
+{
+   struct virtnet_bypass_info *vi = netdev_priv(dev);
+   struct net_device *child_netdev;
+
+   netif_tx_disable(dev);
+
+   child_netdev = rtnl_dereference(vi->active_netdev);
+   if (child_netdev)
+   dev_close(child_netdev);
+
+   child_netdev = rtnl_dereference(vi->backup_netdev);
+   if (child_netdev)
+   dev_close(child_netdev);
+
+   return 0;
+}
+
+static netdev_tx_t
+virtnet_bypass_drop_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+   atomic_long_inc(&dev->tx_dropped);
+   dev_kfree_skb_any(skb);
+   return NETDEV_TX_OK;
+}
+
+static netdev_tx_t
+virtnet_bypass_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+   struct virtnet_bypass_info *vbi = netdev_priv(dev);
+   struct net_device *xmit_dev;
+
+   /* Try 

[RFC PATCH v3 1/3] virtio_net: Introduce VIRTIO_NET_F_BACKUP feature bit

2018-02-16 Thread Sridhar Samudrala
This feature bit can be used by hypervisor to indicate virtio_net device to
act as a backup for another device with the same MAC address.

VIRTIO_NET_F_BACKUP is defined as bit 62 as it is a device feature bit.

Signed-off-by: Sridhar Samudrala 
---
 drivers/net/virtio_net.c| 2 +-
 include/uapi/linux/virtio_net.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 626c27352ae2..bcd13fe906ca 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2920,7 +2920,7 @@ static struct virtio_device_id id_table[] = {
VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
VIRTIO_NET_F_CTRL_MAC_ADDR, \
VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
-   VIRTIO_NET_F_SPEED_DUPLEX
+   VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_BACKUP
 
 static unsigned int features[] = {
VIRTNET_FEATURES,
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 5de6ed37695b..c7c35fd1a5ed 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -57,6 +57,9 @@
 * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23  /* Set MAC address */
 
+#define VIRTIO_NET_F_BACKUP  62/* Act as backup for another device
+* with the same MAC.
+*/
 #define VIRTIO_NET_F_SPEED_DUPLEX 63   /* Device set linkspeed and duplex */
 
 #ifndef VIRTIO_NET_NO_LEGACY
-- 
2.14.3



[RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device

2018-02-16 Thread Sridhar Samudrala
Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be
used by hypervisor to indicate that virtio_net interface should act as
a backup for another device with the same MAC address.

Ppatch 2 is in response to the community request for a 3 netdev
solution.  However, it creates some issues we'll get into in a moment.
It extends virtio_net to use alternate datapath when available and
registered. When BACKUP feature is enabled, virtio_net driver creates
an additional 'bypass' netdev that acts as a master device and controls
2 slave devices.  The original virtio_net netdev is registered as
'backup' netdev and a passthru/vf device with the same MAC gets
registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
associated with the same 'pci' device.  The user accesses the network
interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
as default for transmits when it is available with link up and running.

We noticed a couple of issues with this approach during testing.
- As both 'bypass' and 'backup' netdevs are associated with the same
  virtio pci device, udev tries to rename both of them with the same name
  and the 2nd rename will fail. This would be OK as long as the first netdev
  to be renamed is the 'bypass' netdev, but the order in which udev gets
  to rename the 2 netdevs is not reliable. 
- When the 'active' netdev is unplugged OR not present on a destination
  system after live migration, the user will see 2 virtio_net netdevs.

Patch 3 refactors much of the changes made in patch 2, which was done on 
purpose just to show the solution we recommend as part of one patch set.  
If we submit a final version of this, we would combine patch 2/3 together.
This patch removes the creation of an additional netdev, Instead, it
uses a new virtnet_bypass_info struct added to the original 'backup' netdev
to track the 'bypass' information and introduces an additional set of ndo and 
ethtool ops that are used when BACKUP feature is enabled.

One difference with the 3 netdev model compared to the 2 netdev model is that
the 'bypass' netdev is created with 'noqueue' qdisc marked as 'NETIF_F_LLTX'. 
This avoids going through an additional qdisc and acquiring an additional
qdisc and tx lock during transmits.
If we can replace the qdisc of virtio netdev dynamically, it should be
possible to get these optimizations enabled even with 2 netdev model when
BACKUP feature is enabled.

As this patch series is initially focusing on usecases where hypervisor 
fully controls the VM networking and the guest is not expected to directly 
configure any hardware settings, it doesn't expose all the ndo/ethtool ops
that are supported by virtio_net at this time. To support additional usecases,
it should be possible to enable additional ops later by caching the state
in virtio netdev and replaying when the 'active' netdev gets registered. 
 
The hypervisor needs to enable only one datapath at any time so that packets
don't get looped back to the VM over the other datapath. When a VF is
plugged, the virtio datapath link state can be marked as down.
At the time of live migration, the hypervisor needs to unplug the VF device
from the guest on the source host and reset the MAC filter of the VF to
initiate failover of datapath to virtio before starting the migration. After
the migration is completed, the destination hypervisor sets the MAC filter
on the VF and plugs it back to the guest to switch over to VF datapath.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

Sridhar Samudrala (3):
  virtio_net: Introduce VIRTIO_NET_F_BACKUP feature bit
  virtio_net: Extend virtio to use VF datapath when available
  virtio_net: Enable alternate datapath without creating an additional
netdev

 drivers/net/virtio_net.c| 564 +++-
 include/uapi/linux/virtio_net.h |   3 +
 2 files changed, 563 insertions(+), 4 deletions(-)

-- 
2.14.3


[PATCH] net: dsa: mv88e6xxx: hwtstamp: remove unnecessary range checking tests

2018-02-16 Thread Gustavo A. R. Silva
_port_ is already known to be a valid index in the callers [1]. So
these checks are unnecessary.

[1] https://lkml.org/lkml/2018/2/16/469

Addresses-Coverity-ID: 1465287
Addresses-Coverity-ID: 1465291
Suggested-by: Richard Cochran 
Signed-off-by: Gustavo A. R. Silva 
---
 drivers/net/dsa/mv88e6xxx/hwtstamp.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.c 
b/drivers/net/dsa/mv88e6xxx/hwtstamp.c
index b251d53..c6d6a35 100644
--- a/drivers/net/dsa/mv88e6xxx/hwtstamp.c
+++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.c
@@ -179,9 +179,6 @@ int mv88e6xxx_port_hwtstamp_set(struct dsa_switch *ds, int 
port,
if (!chip->info->ptp_support)
return -EOPNOTSUPP;
 
-   if (port < 0 || port >= mv88e6xxx_num_ports(chip))
-   return -EINVAL;
-
if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
return -EFAULT;
 
@@ -206,9 +203,6 @@ int mv88e6xxx_port_hwtstamp_get(struct dsa_switch *ds, int 
port,
if (!chip->info->ptp_support)
return -EOPNOTSUPP;
 
-   if (port < 0 || port >= mv88e6xxx_num_ports(chip))
-   return -EINVAL;
-
return copy_to_user(ifr->ifr_data, config, sizeof(*config)) ?
-EFAULT : 0;
 }
@@ -255,9 +249,6 @@ static u8 *mv88e6xxx_should_tstamp(struct mv88e6xxx_chip 
*chip, int port,
if (!chip->info->ptp_support)
return NULL;
 
-   if (port < 0 || port >= mv88e6xxx_num_ports(chip))
-   return NULL;
-
hdr = parse_ptp_header(skb, type);
if (!hdr)
return NULL;
-- 
2.7.4



Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Holger Hoffstätte
On 02/16/18 18:25, Oleksandr Natalenko wrote:
> So, going on with two real HW hosts. They are both running latest stock Arch 
> Linux kernel (4.15.3-1-ARCH, CONFIG_PREEMPT=y, CONFIG_HZ=1000) and are 
> interconnected with 1 Gbps link (via switch if that matters). Using iperf3, 
> running each test for 20 seconds.
> 
> Having BBR+fq_codel (or pfifo_fast, same result) on both hosts:
> 
> Client to server: 112 Mbits/sec
> Server to client: 96.1 Mbits/sec
> 
> Having BBR+fq on both hosts:
> 
> Client to server: 347 Mbits/sec
> Server to client: 397 Mbits/sec
> 
> Having YeAH+fq on both hosts:
> 
> Client to server: 928 Mbits/sec
> Server to client: 711 Mbits/sec
> 
> (when the server generates traffic, the throughput is a little bit lower, as 
> you can see, but I assume that's because I have there low-power Silvermont 
> CPU, when the client has Ivy Bridge beast)

There is simply no reason why you shouldn't get approx. line rate (~920+-ish)
Mbit over wired 1GBit Ethernet; even my broken 10-year old Core2Duo laptop can
do that. Can you boot with spectre_v2=off and try "the simplest case" with the
defaults cubic/pfifo_fast? spectre_v2 has terrible performance impact esp.
on small/older processors.

When I last benchmarked full PREEMPT with 4.9.x it was similarly bad and also
had a noticeable network throughput impact even on my i7.

Also congratulations for being the only other person I know who ever tried
YeAH. :-)

cheers
Holger


Re: [PATCH v2] net: dsa: mv88e6xxx: hwtstamp: fix potential negative array index read

2018-02-16 Thread Gustavo A. R. Silva



On 02/16/2018 09:56 AM, Richard Cochran wrote:

On Fri, Feb 16, 2018 at 07:48:46AM -0800, Richard Cochran wrote:

On Thu, Feb 15, 2018 at 12:31:39PM -0600, Gustavo A. R. Silva wrote:

_port_ is being used as index to array port_hwtstamp before verifying
it is a non-negative number and a valid index at line 209 and 258:

if (port < 0 || port >= mv88e6xxx_num_ports(chip))

Fix this by checking _port_ before using it as index to array
port_hwtstamp.


NAK.   Port is already known to be valid in the callers.


And so the real bug is the pointless range checking tests.  I would
welcome patches to remove those.



I just sent a patch for this.

Thank you both, Andrew and Richard for the feedback.
--
Gustavo


Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Oleksandr Natalenko
Hi.

On pátek 16. února 2018 17:25:58 CET Eric Dumazet wrote:
> The way TCP pacing works, it defaults to internal pacing using a hint
> stored in the socket.
> 
> If you change the qdisc while flow is alive, result could be unexpected.

I don't change a qdisc while flow is alive. Either the VM is completely 
restarted, or iperf3 is restarted on both sides.

> (TCP socket remembers that one FQ was supposed to handle the pacing)
> 
> What results do you have if you use standard pfifo_fast ?

Almost the same as with fq_codel (see my previous email with numbers).

> I am asking because TCP pacing relies on High resolution timers, and
> that might be weak on your VM.

Also, I've switched to measuring things on a real HW only (also see previous 
email with numbers).

Thanks.

Regards,
  Oleksandr




Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Oleksandr Natalenko
Hi.

On pátek 16. února 2018 17:26:11 CET Holger Hoffstätte wrote:
> These are very odd configurations. :)
> Non-preempt/100 might well be too slow, whereas PREEMPT/1000 might simply
> have too much overhead.

Since the pacing is based on hrtimers, should HZ matter at all? Even if so, 
poor 1 Gbps link shouldn't drop to below 100 Mbps, for sure.

> BBR in general will run with lower cwnd than e.g. Cubic or others.
> That's a feature and necessary for WAN transfers.

Okay, got it.

> Something seems really wrong with your setup. I get completely
> expected throughput on wired 1Gb between two hosts:
> /* snip */

Yes, and that's strange :/. And that's why I'm wondering what I am missing 
since things cannot be *that* bad.

> /* snip */
> Please note that BBR was developed to address the case of WAN transfers
> (or more precisely high BDP paths) which often suffer from TCP throughput
> collapse due to single packet loss events. While it might "work" in other
> scenarios as well, strictly speaking delay-based anything is increasingly
> less likely to work when there is no meaningful notion of delay - such
> as on a LAN. (yes, this is very simplified..)
> 
> The BBR mailing list has several nice reports why the current BBR
> implementation (dubbed v1) has a few - sometimes severe - problems.
> These are being addressed as we speak.
> 
> (let me know if you want some of those tech reports by email. :)

Well, yes, please, why not :).

> /* snip */
> I'm not sure testing the old version without builtin pacing is going to help
> matters in finding the actual problem. :)
> Several people have reported severe performance regressions with 4.15.x,
> maybe that's related. Can you test latest 4.14.x?

Observed this on v4.14 too but didn't pay much attention until realised that 
things look definitely wrong.

> Out of curiosity, what is the expected use case for BBR here?

Nothing special, just assumed it could be set as a default for both WAN and 
LAN usage.

Regards,
  Oleksandr




Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Oleksandr Natalenko
Hi.

On pátek 16. února 2018 17:33:48 CET Neal Cardwell wrote:
> Thanks for the detailed report! Yes, this sounds like an issue in BBR. We
> have not run into this one in our team, but we will try to work with you to
> fix this.
> 
> Would you be able to take a sender-side tcpdump trace of the slow BBR
> transfer ("v4.13 + BBR + fq_codel == Not OK")? Packet headers only would be
> fine. Maybe something like:
> 
>   tcpdump -w /tmp/test.pcap -c100 -s 100 -i eth0 port $PORT

So, going on with two real HW hosts. They are both running latest stock Arch 
Linux kernel (4.15.3-1-ARCH, CONFIG_PREEMPT=y, CONFIG_HZ=1000) and are 
interconnected with 1 Gbps link (via switch if that matters). Using iperf3, 
running each test for 20 seconds.

Having BBR+fq_codel (or pfifo_fast, same result) on both hosts:

Client to server: 112 Mbits/sec
Server to client: 96.1 Mbits/sec

Having BBR+fq on both hosts:

Client to server: 347 Mbits/sec
Server to client: 397 Mbits/sec

Having YeAH+fq on both hosts:

Client to server: 928 Mbits/sec
Server to client: 711 Mbits/sec

(when the server generates traffic, the throughput is a little bit lower, as 
you can see, but I assume that's because I have there low-power Silvermont 
CPU, when the client has Ivy Bridge beast)

Now, to tcpdump. I've captured it 2 times, for client-to-server flow (c2s) and 
for server-to-client flow (s2c) while using BBR + pfifo_fast:

# tcpdump -w test_XXX.pcap -c100 -s 100 -i enp2s0 port 5201

I've uploaded both files here [1].

Thanks.

Oleksandr

[1] https://natalenko.name/myfiles/bbr/




Re: [RFC net PATCH] virtio_net: disable XDP_REDIRECT in receive_mergeable() case

2018-02-16 Thread John Fastabend
On 02/16/2018 07:41 AM, Jesper Dangaard Brouer wrote:
> On Fri, 16 Feb 2018 13:31:37 +0800
> Jason Wang  wrote:
> 
>> On 2018年02月16日 06:43, Jesper Dangaard Brouer wrote:
>>> The virtio_net code have three different RX code-paths in receive_buf().
>>> Two of these code paths can handle XDP, but one of them is broken for
>>> at least XDP_REDIRECT.
>>>
>>> Function(1): receive_big() does not support XDP.
>>> Function(2): receive_small() support XDP fully and uses build_skb().
>>> Function(3): receive_mergeable() broken XDP_REDIRECT uses napi_alloc_skb().
>>>
>>> The simple explanation is that receive_mergeable() is broken because
>>> it uses napi_alloc_skb(), which violates XDP given XDP assumes packet
>>> header+data in single page and enough tail room for skb_shared_info.
>>>
>>> The longer explaination is that receive_mergeable() tries to
>>> work-around and satisfy these XDP requiresments e.g. by having a
>>> function xdp_linearize_page() that allocates and memcpy RX buffers
>>> around (in case packet is scattered across multiple rx buffers).  This
>>> does currently satisfy XDP_PASS, XDP_DROP and XDP_TX (but only because
>>> we have not implemented bpf_xdp_adjust_tail yet).
>>>
>>> The XDP_REDIRECT action combined with cpumap is broken, and cause hard
>>> to debug crashes.  The main issue is that the RX packet does not have
>>> the needed tail-room (SKB_DATA_ALIGN(skb_shared_info)), causing
>>> skb_shared_info to overlap the next packets head-room (in which cpumap
>>> stores info).
>>>
>>> Reproducing depend on the packet payload length and if RX-buffer size
>>> happened to have tail-room for skb_shared_info or not.  But to make
>>> this even harder to troubleshoot, the RX-buffer size is runtime
>>> dynamically change based on an Exponentially Weighted Moving Average
>>> (EWMA) over the packet length, when refilling RX rings.
>>>
>>> This patch only disable XDP_REDIRECT support in receive_mergeable()
>>> case, because it can cause a real crash.
>>>
>>> But IMHO we should NOT support XDP in receive_mergeable() at all,
>>> because the principles behind XDP are to gain speed by (1) code
>>> simplicity, (2) sacrificing memory and (3) where possible moving
>>> runtime checks to setup time.  These principles are clearly being
>>> violated in receive_mergeable(), that e.g. runtime track average
>>> buffer size to save memory consumption.  
>>
>> I agree to disable it for -net now. 
> 
> Okay... I'll send an official patch later.
> 
>> For net-next, we probably can do:
>>
>> - drop xdp_linearize_page() and do XDP through generic XDP helper
>>   after skb was built
> 
> I disagree strongly here - it makes no sense.
> 
> Why do you want to explicit fallback to Generic-XDP?
> (... then all the performance gain is gone!)
> And besides, a couple of function calls later, the generic XDP code
> will/can get invoked anyhow...
> 

Hi, Can we get EWMA to ensure for majority of cases we have the extra
head room? Seems we could just over-estimate the size by N-bytes. In
some cases we may under-estimate and then would need to fall back to
generic-xdp or otherwise growing the buffer which of course would be
painful and slow, but presumably would happen rarely.

I think it would be much better to keep this feature vs kill it and
make its configuration even more painful to get XDP working on virtio.

Disabling EWMA also seems reasonable to me.

> 
> Take a step back:
>  What is the reason/use-case for implementing XDP inside virtio_net?
> 
> From a DDoS/performance perspective XDP in virtio_net happens on the
> "wrong-side" as it is activated _inside_ the guest OS, which is too
> late for a DDoS filter, as the guest kick/switch overhead have already
> occurred.
> 

The hypervisor may not "know" how to detect DDOS if its specific to
the VMs domain. In these cases we aren't measuring pps but are looking
at cycles/packet. In this case dropping cycles/packet frees up CPU
for useful work. Here I expect we can see real CPU % drop in VM by
using XDP.

The other use case is once we have a fast path NIC to VM in kernel
we can expect, from you numbers below, 3+Mpps. I seem to remember
from Jason's netdevconf talk that he had some experimental numbers
that were even better. The other case is the hypervisor is not Linux
and is feeding packets even faster DPDK numbers, again from Jason's
slides, seemed to show 11+Mpps. XDP makes a lot of sense here IMO.

(those specific pps numbers I pulled from memory but the point is
feeding many Mpps into a VM should be doable)

The last thing is we may see hardware VFs emulating virtio at some
point. Then XDP would be needed. With the newer virtio spec under
development my impression is the hardware emulation piece is becoming
more of a focus. But, someone actually working on that could probably
provide a more informed comment.

> I do use XDP_DROP inside the guest (driver virtio_net), but just to
> perform what I can zoom-in benchmarking, for perf-record isolating the
> early RX 

Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Holger Hoffstätte
On 02/16/18 17:56, Neal Cardwell wrote:
> On Fri, Feb 16, 2018 at 11:26 AM, Holger Hoffstätte
>  wrote:
>>
>> BBR in general will run with lower cwnd than e.g. Cubic or others.
>> That's a feature and necessary for WAN transfers.
> 
> Please note that there's no general rule about whether BBR will run
> with a lower or higher cwnd than CUBIC, Reno, or other loss-based
> congestion control algorithms. Whether BBR's cwnd will be lower or
> higher depends on the BDP of the path, the amount of buffering in the
> bottleneck, and the number of flows. BBR tries to match the amount of
> in-flight data to the BDP based on the available bandwidth and the
> two-way propagation delay. This will usually produce an amount of data
> in flight that is smaller than CUBIC/Reno (yielding lower latency) if
> the path has deep buffers (bufferbloat), but can be larger than
> CUBIC/Reno (yielding higher throughput) if the buffers are shallow and
> the traffic is suffering burst losses.

In all my tests I've never seen it larger, but OK. Thanks for the
explanation. :)
On second reading the "necessary for WAN transfers" was phrased a bit
unfortunately, but it likely doesn't matter for Oleksandr's case
anyway..

(snip)

>> Something seems really wrong with your setup. I get completely
>> expected throughput on wired 1Gb between two hosts:
>>
>> Connecting to host tux, port 5201
>> [  5] local 192.168.100.223 port 48718 connected to 192.168.100.222 port 5201
>> [ ID] Interval   Transfer Bitrate Retr  Cwnd
>> [  5]   0.00-1.00   sec   113 MBytes   948 Mbits/sec0204 KBytes
>> [  5]   1.00-2.00   sec   112 MBytes   941 Mbits/sec0204 KBytes
>> [  5]   2.00-3.00   sec   112 MBytes   941 Mbits/sec0204 KBytes
>> [...]
>>
>> Running it locally gives the more or less expected results as well:
>>
>> Connecting to host ragnarok, port 5201
>> [  5] local 192.168.100.223 port 54090 connected to 192.168.100.223 port 5201
>> [ ID] Interval   Transfer Bitrate Retr  Cwnd
>> [  5]   0.00-1.00   sec  8.09 GBytes  69.5 Gbits/sec0512 KBytes
>> [  5]   1.00-2.00   sec  8.14 GBytes  69.9 Gbits/sec0512 KBytes
>> [  5]   2.00-3.00   sec  8.43 GBytes  72.4 Gbits/sec0512 KBytes
>> [...]
>>
>> Both hosts running 4.14.x with bbr and fq_codel (default qdisc everywhere).
> 
> Can you please clarify if this is over bare metal or between VM
> guests? It sounds like Oleksandr's initial report was between KVM VMs,
> so the virtualization may be an ingredient here.

These are real hosts, not VMs, wired by 1Gbit Ethernet (home office).
Like Eric said it's probably weird HZ, slow host, iffy high-res timer
(bad for both fq and fq_codel), overhead of retpoline in a VM or whatnot.

cheers
Holger


[PATCH v3 1/2] dt-bindings: net: Add bindings for National Instruments XGE netdev

2018-02-16 Thread Moritz Fischer
This adds bindings for the NI XGE 1G/10G network device.

Signed-off-by: Moritz Fischer 
---

Changes from v2:
- Addressed Rob's comments w.r.t to IRQ names and typo

Changes from v1:
- Corrected from nixge -> nixge.txt

---
 Documentation/devicetree/bindings/net/nixge.txt | 32 +
 1 file changed, 32 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/nixge.txt

diff --git a/Documentation/devicetree/bindings/net/nixge.txt 
b/Documentation/devicetree/bindings/net/nixge.txt
new file mode 100644
index ..e55af7f0881a
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/nixge.txt
@@ -0,0 +1,32 @@
+* NI XGE Ethernet controller
+
+Required properties:
+- compatible: Should be "ni,xge-enet-2.00"
+- reg: Address and length of the register set for the device
+- interrupts: Should contain tx and rx interrupt
+- interrupt-names: Should be "rx" and "tx"
+- phy-mode: See ethernet.txt file in the same directory.
+- phy-handle: See ethernet.txt file in the same directory.
+- nvmem-cells: Phandle of nvmem cell containing the MAC address
+- nvmem-cell-names: Should be "address"
+
+Examples (10G generic PHY):
+   nixge0: ethernet@4000 {
+   compatible = "ni,xge-enet-2.00";
+   reg = <0x4000 0x6000>;
+
+   nvmem-cells = <&eth1_addr>;
+   nvmem-cell-names = "address";
+
+   interrupts = <0 29 IRQ_TYPE_LEVEL_HIGH>, <0 30 
IRQ_TYPE_LEVEL_HIGH>;
+   interrupt-names = "rx", "tx";
+   interrupt-parent = <&intc>;
+
+   phy-mode = "xgmii";
+   phy-handle = <&ethernet_phy1>;
+
+   ethernet_phy1: ethernet-phy@4 {
+   compatible = "ethernet-phy-ieee802.3-c45";
+   reg = <4>;
+   };
+   };
-- 
2.16.1



[PATCH v3 2/2] net: ethernet: nixge: Add support for National Instruments XGE netdev

2018-02-16 Thread Moritz Fischer
Add support for the National Instruments XGE 1/10G network device.

It uses the EEPROM on the board via NVMEM.

Signed-off-by: Moritz Fischer 
---

Changes from v2:
- Implement recv side NAPI
- Improved error handling
- Implemented C45 writes
- Added ethtool callbacks & blink functionality
- Improved nixge_ctrl_poll_timeout() macro
- Removed dev_dbg() for mdio accesses
- Added businfo to ethtool drvinfo

Changes from v1:
- Added dependency on ARCH_ZYNQ (Kbuild)
- Removed unused variables
- Use of_phy_connect as suggested
- Removed masking of (un)supported modes
- Added #define for some constants
- Removed empty pm functions
- Reworked mac_address handling
- Made nixge_mdio_*() static (sparse)
- Removed driver version
- Addressed timeout loop
- Adressed return values on timeout

---
 drivers/net/ethernet/Kconfig |1 +
 drivers/net/ethernet/Makefile|1 +
 drivers/net/ethernet/ni/Kconfig  |   27 +
 drivers/net/ethernet/ni/Makefile |1 +
 drivers/net/ethernet/ni/nixge.c  | 1352 ++
 5 files changed, 1382 insertions(+)
 create mode 100644 drivers/net/ethernet/ni/Kconfig
 create mode 100644 drivers/net/ethernet/ni/Makefile
 create mode 100644 drivers/net/ethernet/ni/nixge.c

diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index b6cf4b6962f5..908218561fdd 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -129,6 +129,7 @@ config FEALNX
 
 source "drivers/net/ethernet/natsemi/Kconfig"
 source "drivers/net/ethernet/netronome/Kconfig"
+source "drivers/net/ethernet/ni/Kconfig"
 source "drivers/net/ethernet/8390/Kconfig"
 
 config NET_NETX
diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile
index 3cdf01e96e0b..d732e9522b76 100644
--- a/drivers/net/ethernet/Makefile
+++ b/drivers/net/ethernet/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_NET_VENDOR_MYRI) += myricom/
 obj-$(CONFIG_FEALNX) += fealnx.o
 obj-$(CONFIG_NET_VENDOR_NATSEMI) += natsemi/
 obj-$(CONFIG_NET_VENDOR_NETRONOME) += netronome/
+obj-$(CONFIG_NET_VENDOR_NI) += ni/
 obj-$(CONFIG_NET_NETX) += netx-eth.o
 obj-$(CONFIG_NET_VENDOR_NUVOTON) += nuvoton/
 obj-$(CONFIG_NET_VENDOR_NVIDIA) += nvidia/
diff --git a/drivers/net/ethernet/ni/Kconfig b/drivers/net/ethernet/ni/Kconfig
new file mode 100644
index ..cd30f7de16de
--- /dev/null
+++ b/drivers/net/ethernet/ni/Kconfig
@@ -0,0 +1,27 @@
+#
+# National Instuments network device configuration
+#
+
+config NET_VENDOR_NI
+   bool "National Instruments Devices"
+   default y
+   ---help---
+ If you have a network (Ethernet) device belonging to this class, say 
Y.
+
+ Note that the answer to this question doesn't directly affect the
+ kernel: saying N will just cause the configurator to skip all
+ the questions about National Instrument devices.
+ If you say Y, you will be asked for your specific device in the
+ following questions.
+
+if NET_VENDOR_NI
+
+config NI_XGE_MANAGEMENT_ENET
+   tristate "National Instruments XGE management enet support"
+   depends on ARCH_ZYNQ
+   select PHYLIB
+   ---help---
+ Simple LAN device for debug or management purposes. Can
+ support either 10G or 1G PHYs via SFP+ ports.
+
+endif
diff --git a/drivers/net/ethernet/ni/Makefile b/drivers/net/ethernet/ni/Makefile
new file mode 100644
index ..99c664651c51
--- /dev/null
+++ b/drivers/net/ethernet/ni/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_NI_XGE_MANAGEMENT_ENET) += nixge.o
diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
new file mode 100644
index ..9b255c23d7cd
--- /dev/null
+++ b/drivers/net/ethernet/ni/nixge.c
@@ -0,0 +1,1352 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2016-2017, National Instruments Corp.
+ *
+ * Author: Moritz Fischer 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define TX_BD_NUM  64
+#define RX_BD_NUM  128
+
+/* Axi DMA Register definitions */
+
+#define XAXIDMA_TX_CR_OFFSET   0x /* Channel control */
+#define XAXIDMA_TX_SR_OFFSET   0x0004 /* Status */
+#define XAXIDMA_TX_CDESC_OFFSET0x0008 /* Current descriptor 
pointer */
+#define XAXIDMA_TX_TDESC_OFFSET0x0010 /* Tail descriptor pointer */
+
+#define XAXIDMA_RX_CR_OFFSET   0x0030 /* Channel control */
+#define XAXIDMA_RX_SR_OFFSET   0x0034 /* Status */
+#define XAXIDMA_RX_CDESC_OFFSET0x0038 /* Current descriptor 
pointer */
+#define XAXIDMA_RX_TDESC_OFFSET0x0040 /* Tail descriptor pointer */
+
+#define XAXIDMA_CR_RUNSTOP_MASK0x0001 /* Start/stop DMA channel */
+#define XAXIDMA_CR_RESET_MASK  0x0004 /* Reset DMA engine */
+
+#define XAXIDMA_BD_NDESC_OFFSET0x00 /* Next 

Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Oleksandr Natalenko
Hi!

On pátek 16. února 2018 17:45:56 CET Neal Cardwell wrote:
> Eric raises a good question: bare metal vs VMs.
> 
> Oleksandr, your first email mentioned KVM VMs and virtio NICs. Your
> second e-mail did not seem to mention if those results were for bare
> metal or a VM scenario: can you please clarify the details on your
> second set of tests?

Ugh, so many letters simultaneously… I'll answer them one by one if you don't 
mind :).

Both the first and the second set of tests were performed on 2 KVM VMs, but 
from now I'll test everything using real HW only to exclude potential 
influence of virtualisation. Also, as I've already pointed out, on the real HW 
the difference is even bigger (~10 times).

Now, I'm going to answer other emails of yours, including the actual results 
from the real HW and tcpdump output as requested.

Thanks!

Regards,
  Oleksandr




Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Neal Cardwell
On Fri, Feb 16, 2018 at 11:26 AM, Holger Hoffstätte
 wrote:
>
> BBR in general will run with lower cwnd than e.g. Cubic or others.
> That's a feature and necessary for WAN transfers.

Please note that there's no general rule about whether BBR will run
with a lower or higher cwnd than CUBIC, Reno, or other loss-based
congestion control algorithms. Whether BBR's cwnd will be lower or
higher depends on the BDP of the path, the amount of buffering in the
bottleneck, and the number of flows. BBR tries to match the amount of
in-flight data to the BDP based on the available bandwidth and the
two-way propagation delay. This will usually produce an amount of data
in flight that is smaller than CUBIC/Reno (yielding lower latency) if
the path has deep buffers (bufferbloat), but can be larger than
CUBIC/Reno (yielding higher throughput) if the buffers are shallow and
the traffic is suffering burst losses.

>
>>> If using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput
>>> to ~100 Mbps (verifiable not only by iperf3, but also by scp while
>>> transferring some files between hosts).
>
> Something seems really wrong with your setup. I get completely
> expected throughput on wired 1Gb between two hosts:
>
> Connecting to host tux, port 5201
> [  5] local 192.168.100.223 port 48718 connected to 192.168.100.222 port 5201
> [ ID] Interval   Transfer Bitrate Retr  Cwnd
> [  5]   0.00-1.00   sec   113 MBytes   948 Mbits/sec0204 KBytes
> [  5]   1.00-2.00   sec   112 MBytes   941 Mbits/sec0204 KBytes
> [  5]   2.00-3.00   sec   112 MBytes   941 Mbits/sec0204 KBytes
> [...]
>
> Running it locally gives the more or less expected results as well:
>
> Connecting to host ragnarok, port 5201
> [  5] local 192.168.100.223 port 54090 connected to 192.168.100.223 port 5201
> [ ID] Interval   Transfer Bitrate Retr  Cwnd
> [  5]   0.00-1.00   sec  8.09 GBytes  69.5 Gbits/sec0512 KBytes
> [  5]   1.00-2.00   sec  8.14 GBytes  69.9 Gbits/sec0512 KBytes
> [  5]   2.00-3.00   sec  8.43 GBytes  72.4 Gbits/sec0512 KBytes
> [...]
>
> Both hosts running 4.14.x with bbr and fq_codel (default qdisc everywhere).

Can you please clarify if this is over bare metal or between VM
guests? It sounds like Oleksandr's initial report was between KVM VMs,
so the virtualization may be an ingredient here.

thanks,
neal


Re: [PATCH iproute2-next v5 0/9] ipaddress: Make print_linkinfo_brief() static

2018-02-16 Thread Serhey Popovych
David Ahern wrote:
> On 2/15/18 2:23 PM, Serhey Popovych wrote:
>> With this series I propose to make print_linkinfo_brief() static in
>> favor of print_linkinfo() as single point for linkinfo printing.
>>
> 
> ...
> 
>>
>> Thanks,
>> Serhii
>>
>> Serhey Popovych (9):
>>   ipaddress: Abstract IFA_LABEL matching code
>>   ipaddress: ll_map: Replace ll_idx_n2a() with ll_index_to_name()
>>   utils: Reimplement ll_idx_n2a() and introduce ll_idx_a2n()
>>   ipaddress: Improve print_linkinfo()
>>   ipaddress: Simplify print_linkinfo_brief() and it's usage
>>   lib: Correct object file dependencies
>>   utils: Introduce and use get_ifname_rta()
>>   utils: Introduce and use print_name_and_link() to print name@link
>>   ipaddress: Make print_linkinfo_brief() static
>>
>>  Makefile |2 +-
>>  bridge/link.c|   21 ++---
>>  include/ll_map.h |4 +-
>>  include/utils.h  |5 ++
>>  ip/ip_common.h   |2 -
>>  ip/ipaddress.c   |  231 
>> ++
>>  ip/iplink.c  |5 +-
>>  lib/Makefile |4 +-
>>  lib/ll_map.c |   31 +---
>>  lib/utils.c  |   68 
>>  10 files changed, 167 insertions(+), 206 deletions(-)
>>
> 
> Series applied after the pretty fixup from a separate patch.
> 
> Serhey: please cc me on patches for iproute2-next; I have a separate
> email account for netdev mail and at times it gets way behind the netdev
> list.
> 

Accepted, will cc when sending next patches. Thank you.



signature.asc
Description: OpenPGP digital signature


[PATCH][V2] net: dsa: mv88e6xxx: avoid unintended sign extension on a 16 bit shift

2018-02-16 Thread Colin King
From: Colin Ian King 

The shifting of timehi by 16 bits to the left will be promoted to
a 32 bit signed int and then sign-extended to an u64. If the top bit
of timehi is set then all then all the upper bits of ns end up as also
being set because of the sign-extension. Fix this by making timehi and
timelo u64.  Also move the declaration of ns.

Detected by CoverityScan, CID#1465288 ("Unintended sign extension")

Fixes: c6fe0ad2c349 ("net: dsa: mv88e6xxx: add rx/tx timestamping support")
Signed-off-by: Colin Ian King 
---
 drivers/net/dsa/mv88e6xxx/hwtstamp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.c 
b/drivers/net/dsa/mv88e6xxx/hwtstamp.c
index b251d534b70d..2149d332dea0 100644
--- a/drivers/net/dsa/mv88e6xxx/hwtstamp.c
+++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.c
@@ -293,7 +293,8 @@ static void mv88e6xxx_get_rxts(struct mv88e6xxx_chip *chip,
   struct sk_buff *skb, u16 reg,
   struct sk_buff_head *rxq)
 {
-   u16 buf[4] = { 0 }, status, timelo, timehi, seq_id;
+   u16 buf[4] = { 0 }, status, seq_id;
+   u64 ns, timelo, timehi;
struct skb_shared_hwtstamps *shwt;
int err;
 
@@ -321,7 +322,7 @@ static void mv88e6xxx_get_rxts(struct mv88e6xxx_chip *chip,
 */
for ( ; skb; skb = skb_dequeue(rxq)) {
if (mv88e6xxx_ts_valid(status) && seq_match(skb, seq_id)) {
-   u64 ns = timehi << 16 | timelo;
+   ns = timehi << 16 | timelo;
 
			mutex_lock(&chip->reg_lock);
			ns = timecounter_cyc2time(&chip->tstamp_tc, ns);
-- 
2.15.1



Re: [PATCH] i40evf: remove redundant array comparisons to 0 checks

2018-02-16 Thread Colin Ian King
On 16/02/18 16:51, Andy Shevchenko wrote:
> On Thu, Feb 15, 2018 at 9:42 PM, Colin King  wrote:
>> From: Colin Ian King 
>>
>> The checks to see if key->dst.s6_addr and key->src.s6_addr are null
>> pointers are redundant because these are constant size arrays and
>> so the checks always return true.  Fix this by removing the redundant
>> checks.
> 
>> +   for (i = 0; i < 4; i++)
> 
>> +   filter->f.mask.tcp_spec.dst_ip[i] |=
>> 
>> cpu_to_be32(0xffffffff);
> 
> Can it be one line then?

I re-adjusted the text because checkpatch was complaining.

> 
>> +   memcpy(&filter->f.data.tcp_spec.dst_ip,
>> +  &key->dst.s6_addr32,
> 
> Ditto.
> 
>> +  sizeof(filter->f.data.tcp_spec.dst_ip));
>> +
>> +   for (i = 0; i < 4; i++)
> 
>> +   filter->f.mask.tcp_spec.src_ip[i] |=
>> 
>> cpu_to_be32(0xffffffff);
> 
> Ditto.
> 
>> +   memcpy(&filter->f.data.tcp_spec.src_ip,
>> +  &key->src.s6_addr32,
> 
> Ditto.
> 
>> +  sizeof(filter->f.data.tcp_spec.src_ip));
> 



Re: [PATCH RFC 0/4] net: add bpfilter

2018-02-16 Thread Daniel Borkmann
Hi Florian,

thanks for your feedback! More inline:

On 02/16/2018 03:57 PM, Florian Westphal wrote:
> Daniel Borkmann  wrote:
>> This is a very rough and early proof of concept that implements bpfilter.
> 
> [..]
> 
>> Also, as a benefit from such design, we get BPF JIT compilation on x86_64,
>> arm64, ppc64, sparc64, mips64, s390x and arm32, but also rule offloading
>> into HW for free for Netronome NFP SmartNICs that are already capable of
>> offloading BPF since we can reuse all existing BPF infrastructure as the
>> back end. The user space iptables binary issuing rule addition or dumps was
>> left as-is, thus at some point any binaries against iptables uapi kernel
>> interface could transparently be supported in such manner in long term.
>>
>> As rule translation can potentially become very complex, this is performed
>> entirely in user space. In order to ease deployment, request_module() code
>> is extended to allow user mode helpers to be invoked. Idea is that user mode
>> helpers are built as part of the kernel build and installed as traditional
>> kernel modules with .ko file extension into distro specified location,
>> such that from a distribution point of view, they are no different than
>> regular kernel modules. Thus, allow request_module() logic to load such
>> user mode helper (umh) binaries via:
>>
>>   request_module("foo") ->
>> call_umh("modprobe foo") ->
>>   sys_finit_module(FD of /lib/modules/.../foo.ko) ->
>> call_umh(struct file)
>>
>> Such approach enables kernel to delegate functionality traditionally done
>> by kernel modules into user space processes (either root or !root) and
>> reduces security attack surface of such new code, meaning in case of
>> potential bugs only the umh would crash but not the kernel. Another
>> advantage coming with that would be that bpfilter.ko can be debugged and
>> tested out of user space as well (e.g. opening the possibility to run
>> all clang sanitizers, fuzzers or test suites for checking translation).
> 
> Several questions spinning at the moment, I will probably come up with
> more:

Sure, no problem at all. It's an early RFC, so purpose is to get a
discussion going on such potential approach.

> 1. Does this still attach the binary blob to the 'normal' iptables
>hooks?

Yeah, so thought would be to keep the user land tooling functional as
is w/o having to recompile binaries, thus this would also need to attach
for the existing hooks in order to keep semantics working. As a benefit
in addition we can also reuse all the rest of the infrastructure to utilize
things like XDP for iptables in the background, there is definitely
flexibility on this side thus users could eventually benefit from this
transparently and don't need to know that 'bpfilter' exists and is
translating in the background. I realize taking this path is a long term
undertake that we would need to tackle as a community, not just one or
two individuals when we decide to go for this direction.

> 2. If yes, do you see issues wrt. 'iptables' and 'bpfilter' attached
> programs being different in nature (e.g. changed by different entities)?

There could certainly be multiple options, e.g. a fall-through with state
transfer once a request cannot be handled yet or a sysctl with iptables
being the default handler and an option to switch to bpfilter for letting
it handle requests for that time being.

> 3. What happens if the rule can't be translated (yet?)

(See above.)

> 4. Do you plan to reimplement connection tracking in userspace?

One option could be to have a generic, skb-less connection tracker in kernel
that can be reused from the various hooks it would need to handle, potentially
that would also be able to get offloaded into HW as another benefit coming
out from that.

> If no, how will the bpf program interact with it?
> [ same question applies to ipv6 exthdr traversal, ip defragmentation
> and the like ].

The v6 exthdr traversal could be realized natively via BPF which should
make the parsing more robust at the same time than having it somewhere
inside a helper in kernel directly; bounded loops in BPF would help as
well on that front, similarly for defrag this could be handled by the prog
although here we would need additional infra to queue the packets and then
recirculate.

> I will probably have a quadrillion of followup questions, sorry :-/

Definitely, please do!

Thanks,
Daniel

>> Also, such architecture makes the kernel/user boundary very precise,
>> meaning requests can be handled and BPF translated in control plane part
>> in user space with its own user memory etc, while minimal data plane
>> bits are in kernel. It would also allow to remove old xtables modules
>> at some point from the kernel while keeping functionality in place.
> 
> This is what we tried with nftables :-/


Re: [PATCH] i40evf: remove redundant array comparisons to 0 checks

2018-02-16 Thread Andy Shevchenko
On Thu, Feb 15, 2018 at 9:42 PM, Colin King  wrote:
> From: Colin Ian King 
>
> The checks to see if key->dst.s6_addr and key->src.s6_addr are null
> pointers are redundant because these are constant size arrays and
> so the checks always return true.  Fix this by removing the redundant
> checks.

> +   for (i = 0; i < 4; i++)

> +   filter->f.mask.tcp_spec.dst_ip[i] |=
> 
> cpu_to_be32(0xffffffff);

Can it be one line then?

> +   memcpy(&filter->f.data.tcp_spec.dst_ip,
> +  &key->dst.s6_addr32,

Ditto.

> +  sizeof(filter->f.data.tcp_spec.dst_ip));
> +
> +   for (i = 0; i < 4; i++)

> +   filter->f.mask.tcp_spec.src_ip[i] |=
> 
> cpu_to_be32(0xffffffff);

Ditto.

> +   memcpy(&filter->f.data.tcp_spec.src_ip,
> +  &key->src.s6_addr32,

Ditto.

> +  sizeof(filter->f.data.tcp_spec.src_ip));

-- 
With Best Regards,
Andy Shevchenko


Re: Serious performance degradation in Linux 4.15

2018-02-16 Thread Peter Zijlstra
On Fri, Feb 16, 2018 at 02:38:39PM +, Matt Fleming wrote:
> On Wed, 14 Feb, at 10:46:20PM, Matt Fleming wrote:

> > Here's some more numbers. This is with RETPOLINE=y but you'll see it
> > doesn't make much of a difference. Oh, this is also with powersave
> > cpufreq governor.
> 
> Feh, I was wrong. The differences in performance I see are entirely
> due to CONFIG_RETPOLINE and CONFIG_PAGE_TABLE_ISOLATION being enabled.

OK, so I'm not the one crazy person not being able to reproduce this :-)

Lets wait for more details from the original submitter.


Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Neal Cardwell
On Fri, Feb 16, 2018 at 11:43 AM, Eric Dumazet  wrote:
>
> On Fri, Feb 16, 2018 at 8:33 AM, Neal Cardwell  wrote:
> > Oleksandr,
> >
> > Thanks for the detailed report! Yes, this sounds like an issue in BBR. We
> > have not run into this one in our team, but we will try to work with you to
> > fix this.
> >
> > Would you be able to take a sender-side tcpdump trace of the slow BBR
> > transfer ("v4.13 + BBR + fq_codel == Not OK")? Packet headers only would be
> > fine. Maybe something like:
> >
> >   tcpdump -w /tmp/test.pcap -c100 -s 100 -i eth0 port $PORT
> >
> > Thanks!
> > neal
>
> On baremetal and using latest net tree, I get pretty normal results at
> least, on 40Gbit NIC,

Eric raises a good question: bare metal vs VMs.

Oleksandr, your first email mentioned KVM VMs and virtio NICs. Your
second e-mail did not seem to mention if those results were for bare
metal or a VM scenario: can you please clarify the details on your
second set of tests?

Thanks!


Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Eric Dumazet
On Fri, Feb 16, 2018 at 8:33 AM, Neal Cardwell  wrote:
> Oleksandr,
>
> Thanks for the detailed report! Yes, this sounds like an issue in BBR. We
> have not run into this one in our team, but we will try to work with you to
> fix this.
>
> Would you be able to take a sender-side tcpdump trace of the slow BBR
> transfer ("v4.13 + BBR + fq_codel == Not OK")? Packet headers only would be
> fine. Maybe something like:
>
>   tcpdump -w /tmp/test.pcap -c100 -s 100 -i eth0 port $PORT
>
> Thanks!
> neal

On baremetal and using latest net tree, I get pretty normal results at
least, on 40Gbit NIC,
with pfifo_fast, fq and fq_codel.

# tc qd replace dev eth0 root fq
# ./super_netperf 1 -H lpaa24 -- -K cubic
  25627
# ./super_netperf 1 -H lpaa24 -- -K bbr
  25897
# tc qd replace dev eth0 root fq_codel
# ./super_netperf 1 -H lpaa24 -- -K cubic
  22246
# ./super_netperf 1 -H lpaa24 -- -K bbr
  25228
# tc qd replace dev eth0 root pfifo_fast
# ./super_netperf 1 -H lpaa24 -- -K cubic
  25454
# ./super_netperf 1 -H lpaa24 -- -K bbr
  25508


Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Holger Hoffstätte
On 02/16/18 16:15, Oleksandr Natalenko wrote:
> Hi, David, Eric, Neal et al.
> 
> On čtvrtek 15. února 2018 21:42:26 CET Oleksandr Natalenko wrote:
>> I've faced an issue with a limited TCP bandwidth between my laptop and a
>> server in my 1 Gbps LAN while using BBR as a congestion control mechanism.
>> To verify my observations, I've set up 2 KVM VMs with the following
>> parameters:
>>
>> 1) Linux v4.15.3
>> 2) virtio NICs
>> 3) 128 MiB of RAM
>> 4) 2 vCPUs
>> 5) tested on both non-PREEMPT/100 Hz and PREEMPT/1000 Hz

These are very odd configurations. :)
Non-preempt/100 might well be too slow, whereas PREEMPT/1000 might simply
have too much overhead.

>> The VMs are interconnected via host bridge (-netdev bridge). I was running
>> iperf3 in the default and reverse mode. Here are the results:
>>
>> 1) BBR on both VMs
>>
>> upload: 3.42 Gbits/sec, cwnd ~ 320 KBytes
>> download: 3.39 Gbits/sec, cwnd ~ 320 KBytes
>>
>> 2) Reno on both VMs
>>
>> upload: 5.50 Gbits/sec, cwnd = 976 KBytes (constant)
>> download: 5.22 Gbits/sec, cwnd = 1.20 MBytes (constant)
>>
>> 3) Reno on client, BBR on server
>>
>> upload: 5.29 Gbits/sec, cwnd = 952 KBytes (constant)
>> download: 3.45 Gbits/sec, cwnd ~ 320 KBytes
>>
>> 4) BBR on client, Reno on server
>>
>> upload: 3.36 Gbits/sec, cwnd ~ 370 KBytes
>> download: 5.21 Gbits/sec, cwnd = 887 KBytes (constant)
>>
>> So, as you may see, when BBR is in use, upload rate is bad and cwnd is low.

BBR in general will run with lower cwnd than e.g. Cubic or others.
That's a feature and necessary for WAN transfers.

>> If using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput
>> to ~100 Mbps (verifiable not only by iperf3, but also by scp while
>> transferring some files between hosts).

Something seems really wrong with your setup. I get completely
expected throughput on wired 1Gb between two hosts:

Connecting to host tux, port 5201
[  5] local 192.168.100.223 port 48718 connected to 192.168.100.222 port 5201
[ ID] Interval   Transfer Bitrate Retr  Cwnd
[  5]   0.00-1.00   sec   113 MBytes   948 Mbits/sec0204 KBytes   
[  5]   1.00-2.00   sec   112 MBytes   941 Mbits/sec0204 KBytes   
[  5]   2.00-3.00   sec   112 MBytes   941 Mbits/sec0204 KBytes   
[...]

Running it locally gives the more or less expected results as well:

Connecting to host ragnarok, port 5201
[  5] local 192.168.100.223 port 54090 connected to 192.168.100.223 port 5201
[ ID] Interval   Transfer Bitrate Retr  Cwnd
[  5]   0.00-1.00   sec  8.09 GBytes  69.5 Gbits/sec0512 KBytes   
[  5]   1.00-2.00   sec  8.14 GBytes  69.9 Gbits/sec0512 KBytes   
[  5]   2.00-3.00   sec  8.43 GBytes  72.4 Gbits/sec0512 KBytes   
[...]

Both hosts running 4.14.x with bbr and fq_codel (default qdisc everywhere).
In the past I only used BBR briefly for testing since at 1Gb speeds on my
LAN it was actually slightly slower than Cubic (some of those bugs were
recently addressed) and made no difference otherwise, even for uploads -
which are capped by my 50/10 DSL anyway.

Please note that BBR was developed to address the case of WAN transfers
(or more precisely high BDP paths) which often suffer from TCP throughput
collapse due to single packet loss events. While it might "work" in other
scenarios as well, strictly speaking delay-based anything is increasingly
less likely to work when there is no meaningful notion of delay - such
as on a LAN. (yes, this is very simplified..)

The BBR mailing list has several nice reports why the current BBR
implementation (dubbed v1) has a few - sometimes severe - problems.
These are being addressed as we speak.

(let me know if you want some of those tech reports by email. :)

>> Also, I've tried to use YeAH instead of Reno, and it gives me the same
>> results as Reno (IOW, YeAH works fine too).
>>
>> Questions:
>>
>> 1) is this expected?
>> 2) or am I missing some extra BBR tuneable?

No, it should work out of the box.

>> 3) if it is not a regression (I don't have any previous data to compare
>> with), how can I fix this?
>> 4) if it is a bug in BBR, what else should I provide or check for a proper
>> investigation?
> 
> I've played with BBR a little bit more and managed to narrow the issue down 
> to 
> the changes between v4.12 and v4.13. Here are my observations:
> 
> v4.12 + BBR + fq_codel == OK
> v4.12 + BBR + fq   == OK
> v4.13 + BBR + fq_codel == Not OK
> v4.13 + BBR + fq   == OK
> 
> I think this has something to do with an internal TCP implementation for 
> pacing, that was introduced in v4.13 (commit 218af599fa63) specifically to 
> allow using BBR together with non-fq qdiscs. Once BBR relies on fq, the 
> throughput is high and saturates the link, but if another qdisc is in use, 
> for 
> instance, fq_codel, the throughput drops. Just to be sure, I've also tried 
> pfifo_fast instead of fq_codel with the same outcome resulting in the low 
> throughput.

I'm 

Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Eric Dumazet
On Fri, Feb 16, 2018 at 7:15 AM, Oleksandr Natalenko
 wrote:
> Hi, David, Eric, Neal et al.
>
> On čtvrtek 15. února 2018 21:42:26 CET Oleksandr Natalenko wrote:
>> I've faced an issue with a limited TCP bandwidth between my laptop and a
>> server in my 1 Gbps LAN while using BBR as a congestion control mechanism.
>> To verify my observations, I've set up 2 KVM VMs with the following
>> parameters:
>>
>> 1) Linux v4.15.3
>> 2) virtio NICs
>> 3) 128 MiB of RAM
>> 4) 2 vCPUs
>> 5) tested on both non-PREEMPT/100 Hz and PREEMPT/1000 Hz
>>
>> The VMs are interconnected via host bridge (-netdev bridge). I was running
>> iperf3 in the default and reverse mode. Here are the results:
>>
>> 1) BBR on both VMs
>>
>> upload: 3.42 Gbits/sec, cwnd ~ 320 KBytes
>> download: 3.39 Gbits/sec, cwnd ~ 320 KBytes
>>
>> 2) Reno on both VMs
>>
>> upload: 5.50 Gbits/sec, cwnd = 976 KBytes (constant)
>> download: 5.22 Gbits/sec, cwnd = 1.20 MBytes (constant)
>>
>> 3) Reno on client, BBR on server
>>
>> upload: 5.29 Gbits/sec, cwnd = 952 KBytes (constant)
>> download: 3.45 Gbits/sec, cwnd ~ 320 KBytes
>>
>> 4) BBR on client, Reno on server
>>
>> upload: 3.36 Gbits/sec, cwnd ~ 370 KBytes
>> download: 5.21 Gbits/sec, cwnd = 887 KBytes (constant)
>>
>> So, as you may see, when BBR is in use, upload rate is bad and cwnd is low.
>> If using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput
>> to ~100 Mbps (verifiable not only by iperf3, but also by scp while
>> transferring some files between hosts).
>>
>> Also, I've tried to use YeAH instead of Reno, and it gives me the same
>> results as Reno (IOW, YeAH works fine too).
>>
>> Questions:
>>
>> 1) is this expected?
>> 2) or am I missing some extra BBR tuneable?
>> 3) if it is not a regression (I don't have any previous data to compare
>> with), how can I fix this?
>> 4) if it is a bug in BBR, what else should I provide or check for a proper
>> investigation?
>
> I've played with BBR a little bit more and managed to narrow the issue down to
> the changes between v4.12 and v4.13. Here are my observations:
>
> v4.12 + BBR + fq_codel == OK
> v4.12 + BBR + fq   == OK
> v4.13 + BBR + fq_codel == Not OK
> v4.13 + BBR + fq   == OK
>
> I think this has something to do with an internal TCP implementation for
> pacing, that was introduced in v4.13 (commit 218af599fa63) specifically to
> allow using BBR together with non-fq qdiscs. Once BBR relies on fq, the
> throughput is high and saturates the link, but if another qdisc is in use, for
> instance, fq_codel, the throughput drops. Just to be sure, I've also tried
> pfifo_fast instead of fq_codel with the same outcome resulting in the low
> throughput.
>
> Unfortunately, I do not know if this is something expected or should be
> considered as a regression. Thus, asking for an advice.
>
> Ideas?

The way TCP pacing works, it defaults to internal pacing using a hint
stored in the socket.

If you change the qdisc while flow is alive, result could be unexpected.

(TCP socket remembers that one FQ was supposed to handle the pacing)

What results do you have if you use standard pfifo_fast ?

I am asking because TCP pacing relies on High resolution timers, and
that might be weak on your VM.


Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Eric Dumazet
Lets CC BBR folks at Google, and remove the ones that probably have no
idea.



On Thu, 2018-02-15 at 21:42 +0100, Oleksandr Natalenko wrote:
> Hello.
> 
> I've faced an issue with a limited TCP bandwidth between my laptop and a 
> server in my 1 Gbps LAN while using BBR as a congestion control mechanism. To 
> verify my observations, I've set up 2 KVM VMs with the following parameters:
> 
> 1) Linux v4.15.3
> 2) virtio NICs
> 3) 128 MiB of RAM
> 4) 2 vCPUs
> 5) tested on both non-PREEMPT/100 Hz and PREEMPT/1000 Hz
> 
> The VMs are interconnected via host bridge (-netdev bridge). I was running 
> iperf3 in the default and reverse mode. Here are the results:
> 
> 1) BBR on both VMs
> 
> upload: 3.42 Gbits/sec, cwnd ~ 320 KBytes
> download: 3.39 Gbits/sec, cwnd ~ 320 KBytes
> 
> 2) Reno on both VMs
> 
> upload: 5.50 Gbits/sec, cwnd = 976 KBytes (constant)
> download: 5.22 Gbits/sec, cwnd = 1.20 MBytes (constant)
> 
> 3) Reno on client, BBR on server
> 
> upload: 5.29 Gbits/sec, cwnd = 952 KBytes (constant)
> download: 3.45 Gbits/sec, cwnd ~ 320 KBytes
> 
> 4) BBR on client, Reno on server
> 
> upload: 3.36 Gbits/sec, cwnd ~ 370 KBytes
> download: 5.21 Gbits/sec, cwnd = 887 KBytes (constant)
> 
> So, as you may see, when BBR is in use, upload rate is bad and cwnd is low. 
> If 
> using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput to 
> ~100 Mbps (verifiable not only by iperf3, but also by scp while transferring 
> some files between hosts).
> 
> Also, I've tried to use YeAH instead of Reno, and it gives me the same 
> results 
> as Reno (IOW, YeAH works fine too).
> 
> Questions:
> 
> 1) is this expected?
> 2) or am I missing some extra BBR tuneable?
> 3) if it is not a regression (I don't have any previous data to compare 
> with), 
> how can I fix this?
> 4) if it is a bug in BBR, what else should I provide or check for a proper 
> investigation?
> 
> Thanks.
> 
> Regards,
>   Oleksandr
> 
> 


Re: [PATCH iproute2-next v5 0/9] ipaddress: Make print_linkinfo_brief() static

2018-02-16 Thread David Ahern
On 2/15/18 2:23 PM, Serhey Popovych wrote:
> With this series I propose to make print_linkinfo_brief() static in
> favor of print_linkinfo() as single point for linkinfo printing.
> 

...

> 
> Thanks,
> Serhii
> 
> Serhey Popovych (9):
>   ipaddress: Abstract IFA_LABEL matching code
>   ipaddress: ll_map: Replace ll_idx_n2a() with ll_index_to_name()
>   utils: Reimplement ll_idx_n2a() and introduce ll_idx_a2n()
>   ipaddress: Improve print_linkinfo()
>   ipaddress: Simplify print_linkinfo_brief() and it's usage
>   lib: Correct object file dependencies
>   utils: Introduce and use get_ifname_rta()
>   utils: Introduce and use print_name_and_link() to print name@link
>   ipaddress: Make print_linkinfo_brief() static
> 
>  Makefile |2 +-
>  bridge/link.c|   21 ++---
>  include/ll_map.h |4 +-
>  include/utils.h  |5 ++
>  ip/ip_common.h   |2 -
>  ip/ipaddress.c   |  231 
> ++
>  ip/iplink.c  |5 +-
>  lib/Makefile |4 +-
>  lib/ll_map.c |   31 +---
>  lib/utils.c  |   68 
>  10 files changed, 167 insertions(+), 206 deletions(-)
> 

Series applied after the pretty fixup from a separate patch.

Serhey: please cc me on patches for iproute2-next; I have a separate
email account for netdev mail and at times it gets way behind the netdev
list.


Re: [PATCH RFC 0/4] net: add bpfilter

2018-02-16 Thread Florian Westphal
Florian Westphal  wrote:
> Daniel Borkmann  wrote:
> Several questions spinning at the moment, I will probably come up with
> more:

... and here there are some more ...

One of the many pain points of xtables design is the assumption of 'used
only by sysadmin'.

This has not been true for a very long time, so by now iptables has
this userspace lock (yes, its fugly workaround) to serialize concurrent
iptables invocations in userspace.

AFAIU the translate-in-userspace design now brings back the old problem
of different tools overwriting each others iptables rules.

Another question -- am i correct in that each rule manipulation would
incur a 'recompilation'?  Or are there different mini programs chained
together?

One of the nftables advantages is that (since rule representation in
kernel is black-box from userspace point of view) is that the kernel
can announce add/delete of rules or elements from nftables sets.

Any particular reason why translating iptables rather than nftables
(it should be possible to monitor the nftables changes that are
 announced by kernel and act on those)?


[PATCH v2] ravb: add support for changing MTU

2018-02-16 Thread Niklas Söderlund
Allow for changing the MTU within the limit of the maximum size of a
descriptor (2048 bytes). Add the callback to change MTU from user-space
and take the configurable MTU into account when configuring the
hardware.

Signed-off-by: Niklas Söderlund 
---
 drivers/net/ethernet/renesas/ravb.h  |  1 +
 drivers/net/ethernet/renesas/ravb_main.c | 34 +---
 2 files changed, 28 insertions(+), 7 deletions(-)

* Changes since v1
- Fix spelling error.
- s/le16_to_cpu(rx_desc->ds_cc)/priv->rx_buf_sz/.
- Drop ETH_FCS_LEN + 16 from rx_buf_sz calculation as pointed out by 
  Sergei.

diff --git a/drivers/net/ethernet/renesas/ravb.h 
b/drivers/net/ethernet/renesas/ravb.h
index 96a27b00c90e212a..b81f4faf7b10114d 100644
--- a/drivers/net/ethernet/renesas/ravb.h
+++ b/drivers/net/ethernet/renesas/ravb.h
@@ -1018,6 +1018,7 @@ struct ravb_private {
u32 dirty_rx[NUM_RX_QUEUE]; /* Producer ring indices */
u32 cur_tx[NUM_TX_QUEUE];
u32 dirty_tx[NUM_TX_QUEUE];
+   u32 rx_buf_sz;  /* Based on MTU+slack. */
struct napi_struct napi[NUM_RX_QUEUE];
struct work_struct work;
/* MII transceiver section. */
diff --git a/drivers/net/ethernet/renesas/ravb_main.c 
b/drivers/net/ethernet/renesas/ravb_main.c
index c87f57ca44371586..34e841306e04a3d3 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -238,7 +238,7 @@ static void ravb_ring_free(struct net_device *ndev, int q)
   le32_to_cpu(desc->dptr)))
dma_unmap_single(ndev->dev.parent,
 le32_to_cpu(desc->dptr),
-PKT_BUF_SZ,
+priv->rx_buf_sz,
 DMA_FROM_DEVICE);
}
ring_size = sizeof(struct ravb_ex_rx_desc) *
@@ -300,9 +300,9 @@ static void ravb_ring_format(struct net_device *ndev, int q)
for (i = 0; i < priv->num_rx_ring[q]; i++) {
/* RX descriptor */
		rx_desc = &priv->rx_ring[q][i];
-   rx_desc->ds_cc = cpu_to_le16(PKT_BUF_SZ);
+   rx_desc->ds_cc = cpu_to_le16(priv->rx_buf_sz);
dma_addr = dma_map_single(ndev->dev.parent, 
priv->rx_skb[q][i]->data,
- PKT_BUF_SZ,
+ priv->rx_buf_sz,
  DMA_FROM_DEVICE);
/* We just set the data size to 0 for a failed mapping which
 * should prevent DMA from happening...
@@ -346,6 +346,10 @@ static int ravb_ring_init(struct net_device *ndev, int q)
int ring_size;
int i;
 
+   /* +16 gets room from the status from the card. */
+   priv->rx_buf_sz = (ndev->mtu <= 1492 ? PKT_BUF_SZ : ndev->mtu) +
+   ETH_HLEN + VLAN_HLEN;
+
/* Allocate RX and TX skb rings */
priv->rx_skb[q] = kcalloc(priv->num_rx_ring[q],
  sizeof(*priv->rx_skb[q]), GFP_KERNEL);
@@ -355,7 +359,7 @@ static int ravb_ring_init(struct net_device *ndev, int q)
goto error;
 
for (i = 0; i < priv->num_rx_ring[q]; i++) {
-   skb = netdev_alloc_skb(ndev, PKT_BUF_SZ + RAVB_ALIGN - 1);
+   skb = netdev_alloc_skb(ndev, priv->rx_buf_sz + RAVB_ALIGN - 1);
if (!skb)
goto error;
ravb_set_buffer_align(skb);
@@ -586,7 +590,7 @@ static bool ravb_rx(struct net_device *ndev, int *quota, 
int q)
skb = priv->rx_skb[q][entry];
priv->rx_skb[q][entry] = NULL;
dma_unmap_single(ndev->dev.parent, 
le32_to_cpu(desc->dptr),
-PKT_BUF_SZ,
+priv->rx_buf_sz,
 DMA_FROM_DEVICE);
get_ts &= (q == RAVB_NC) ?
RAVB_RXTSTAMP_TYPE_V2_L2_EVENT :
@@ -619,11 +623,12 @@ static bool ravb_rx(struct net_device *ndev, int *quota, 
int q)
for (; priv->cur_rx[q] - priv->dirty_rx[q] > 0; priv->dirty_rx[q]++) {
entry = priv->dirty_rx[q] % priv->num_rx_ring[q];
		desc = &priv->rx_ring[q][entry];
-   desc->ds_cc = cpu_to_le16(PKT_BUF_SZ);
+   desc->ds_cc = cpu_to_le16(priv->rx_buf_sz);
 
if (!priv->rx_skb[q][entry]) {
skb = netdev_alloc_skb(ndev,
-  PKT_BUF_SZ + RAVB_ALIGN - 1);
+  priv->rx_buf_sz +
+  RAVB_ALIGN - 1);
if (!skb)
break;  /* 

Re: [PATCH v2] net: dsa: mv88e6xxx: hwtstamp: fix potential negative array index read

2018-02-16 Thread Richard Cochran
On Fri, Feb 16, 2018 at 07:48:46AM -0800, Richard Cochran wrote:
> On Thu, Feb 15, 2018 at 12:31:39PM -0600, Gustavo A. R. Silva wrote:
> > _port_ is being used as index to array port_hwtstamp before verifying
> > it is a non-negative number and a valid index at line 209 and 258:
> > 
> > if (port < 0 || port >= mv88e6xxx_num_ports(chip))
> > 
> > Fix this by checking _port_ before using it as index to array
> > port_hwtstamp.
> 
> NAK.   Port is already known to be valid in the callers.

And so the real bug is the pointless range checking tests.  I would
welcome patches to remove those.

Thanks,
Richard


Re: [PATCH v2] net: dsa: mv88e6xxx: hwtstamp: fix potential negative array index read

2018-02-16 Thread Andrew Lunn
On Fri, Feb 16, 2018 at 07:48:46AM -0800, Richard Cochran wrote:
> On Thu, Feb 15, 2018 at 12:31:39PM -0600, Gustavo A. R. Silva wrote:
> > _port_ is being used as index to array port_hwtstamp before verifying
> > it is a non-negative number and a valid index at line 209 and 258:
> > 
> > if (port < 0 || port >= mv88e6xxx_num_ports(chip))
> > 
> > Fix this by checking _port_ before using it as index to array
> > port_hwtstamp.
> 
> NAK.   Port is already known to be valid in the callers.

Then we should take out the check. It is probably this check which is
causing the false positives.

Andrew


Re: [PATCH][next] net: dsa: mv88e6xxx: avoid unintended sign extension on a 16 bit shift

2018-02-16 Thread Richard Cochran
On Thu, Feb 15, 2018 at 09:27:57PM +0100, Andrew Lunn wrote:
> Do you prefer this, or making timehi and timelo a u64?

The latter.

While you are at it, please move the definition of 'ns' to the start
of the function.

Thanks,
Richard


Re: [RFC][PATCH bpf v2 1/2] bpf: allow 64-bit offsets for bpf function calls

2018-02-16 Thread Naveen N. Rao

Daniel Borkmann wrote:

On 02/15/2018 05:25 PM, Daniel Borkmann wrote:

On 02/13/2018 05:05 AM, Sandipan Das wrote:

The imm field of a bpf_insn is a signed 32-bit integer. For
JIT-ed bpf-to-bpf function calls, it stores the offset from
__bpf_call_base to the start of the callee function.

For some architectures, such as powerpc64, it was found that
this offset may be as large as 64 bits because of which this
cannot be accomodated in the imm field without truncation.

To resolve this, we additionally make aux->func within each
bpf_prog associated with the functions to point to the list
of all function addresses determined by the verifier.

We keep the value assigned to the off field of the bpf_insn
as a way to index into aux->func and also set aux->func_cnt
so that this can be used for performing basic upper bound
checks for the off field.

Signed-off-by: Sandipan Das 
---
v2: Make aux->func point to the list of functions determined
by the verifier rather than allocating a separate callee
list for each function.


Approach looks good to me; do you know whether s390x JIT would
have similar requirement? I think one limitation that would still
need to be addressed later with such approach would be regarding the
xlated prog dump in bpftool, see 'BPF calls via JIT' in 7105e828c087
("bpf: allow for correlation of maps and helpers in dump"). Any
ideas for this (potentially if we could use off + imm for calls,
we'd get to 48 bits, but that seems still not be enough as you say)?


All good points. I'm not really sure how s390x works, so I can't comment 
on that, but I'm copying Michael Holzheu for his consideration.


With the existing scheme, 48 bits won't be enough, so we rejected that 
approach. I can also see how this will be a problem with bpftool, but I 
haven't looked into it in detail. I wonder if we can annotate the output 
to indicate the function being referred to?




One other random thought, although I'm not sure how feasible this
is for ppc64 JIT to realize ... but idea would be to have something
like the below:

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 29ca920..daa7258 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -512,6 +512,11 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long 
*value, char *type,
return ret;
 }

+void * __weak bpf_jit_image_alloc(unsigned long size)
+{
+   return module_alloc(size);
+}
+
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 unsigned int alignment,
@@ -525,7 +530,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 * random section of illegal instructions.
 */
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
-   hdr = module_alloc(size);
+   hdr = bpf_jit_image_alloc(size);
if (hdr == NULL)
return NULL;

And ppc64 JIT could override bpf_jit_image_alloc() in a similar way
like some archs would override the module_alloc() helper through a
custom implementation, usually via __vmalloc_node_range(), so we
could perhaps fit the range for BPF JITed images in a way that they
could use the 32bit imm in the end? There are not that many progs
loaded typically, so the range could be a bit narrower in such case
anyway. (Not sure if this would work out though, but I thought to
bring it up.)


That'd be a good option to consider. I don't think we want to allocate 
anything from the linear memory range since users could load 
unprivileged BPF programs and consume a lot of memory that way. I doubt 
if we can map vmalloc'ed memory into the 0xc0 address range, but I'm not 
entirely sure.


Michael,
Is the above possible? The question is if we can have BPF programs be 
allocated within 4GB of __bpf_call_base (which is a kernel symbol), so 
that calls to those programs can be encoded in a 32-bit immediate field 
in a BPF instruction. As an extension, we may be able to extend it to 
48-bits by combining with another BPF instruction field (offset). In 
either case, the vmalloc'ed address range won't work.


The alternative is to pass the full 64-bit address of the BPF program in 
an auxiliary field (as proposed in this patch set) but we need to fix it 
up for 'bpftool' as well.


Thanks,
Naveen




Re: [PATCH v2] net: dsa: mv88e6xxx: hwtstamp: fix potential negative array index read

2018-02-16 Thread Richard Cochran
On Thu, Feb 15, 2018 at 12:31:39PM -0600, Gustavo A. R. Silva wrote:
> _port_ is being used as index to array port_hwtstamp before verifying
> it is a non-negative number and a valid index at line 209 and 258:
> 
> if (port < 0 || port >= mv88e6xxx_num_ports(chip))
> 
> Fix this by checking _port_ before using it as index to array
> port_hwtstamp.

NAK.   Port is already known to be valid in the callers.

See:

*** net/dsa/slave.c:  dsa_slave_ioctl[266]
*** net/dsa/slave.c:  dsa_skb_tx_timestamp[416]
*** net/dsa/dsa.c:dsa_skb_defer_rx_timestamp[152]
 
> Addresses-Coverity-ID: 1465287 ("Negative array index read")
> Addresses-Coverity-ID: 1465291 ("Negative array index read")

Please check the code before posting.  These false positives are
really annoying.

Thanks,
Richard


Re: [PATCH] inet: don't call skb_orphan if tproxy happens in layer 2

2018-02-16 Thread Gregory Vander Schueren

Hi Florian & Pablo,

Thank you very much for your quick feedback.

On 02/16/2018 12:28 PM, Pablo Neira Ayuso wrote:

On Fri, Feb 16, 2018 at 12:07:06PM +0100, Florian Westphal wrote:

Gregory Vander Schueren  wrote:

[ cc netdev ]


If sysctl bridge-nf-call-iptables is enabled, iptables chains are already
traversed from the bridging code. In such case, tproxy already happened when
reaching ip_rcv. Thus no need to call skb_orphan as this would actually undo
tproxy.


I don't like this because it adds yet another test in fastpath, and for
a use case that has apparently never worked before.


Agreed. I also thought this was not ideal but I did not find another way to 
easily fix this.



We noticed issues when using tproxy with net.bridge.bridge-nf-call-iptables
enabled. In such case, ip_rcv() basically undo tproxy's job. The following
patch proposes a fix.


I question wheter its a good idea to mix tproxy with bridges.

Tproxy relies on policy routing, but a bridge doesn't route :-)

I guess you use bridge snat mac mangling to force local delivery of
packets that are otherwise bridged?


Indeed, we use DNAT MAC mangling.


If yes, can you use ebtables brouting instead?
This would bypass the bridge (so no iptables invocation from bridge
prerouting anymore).


We were actually pondering over the usage of MAC DNAT vs brouting. I'll 
thus follow your suggestion and use brouting instead then.



We will try to get rid of nf-call-iptables eventually.


Good to know!


There might be (more complicated) ways to avoid this problem without
adding code in normal network path, but lets check other options first.


Agreed.

If there's a fix for this, that should be away from the fast path, not
in ip_rcv().



--

--
DISCLAIMER.
This email and any files transmitted with it are confidential and intended 
solely for the use of the individual or entity to whom they are addressed. 
If you have received this email in error please notify the system manager. 
This message contains confidential information and is intended only for the 
individual named. If you are not the named addressee you should not 
disseminate, distribute or copy this e-mail. Please notify the sender 
immediately by e-mail if you have received this e-mail by mistake and 
delete this e-mail from your system. If you are not the intended recipient 
you are notified that disclosing, copying, distributing or taking any 
action in reliance on the contents of this information is strictly 
prohibited.


Re: [RFC net PATCH] virtio_net: disable XDP_REDIRECT in receive_mergeable() case

2018-02-16 Thread Jesper Dangaard Brouer
On Fri, 16 Feb 2018 13:31:37 +0800
Jason Wang  wrote:

> On 2018年02月16日 06:43, Jesper Dangaard Brouer wrote:
> > The virtio_net code have three different RX code-paths in receive_buf().
> > Two of these code paths can handle XDP, but one of them is broken for
> > at least XDP_REDIRECT.
> >
> > Function(1): receive_big() does not support XDP.
> > Function(2): receive_small() support XDP fully and uses build_skb().
> > Function(3): receive_mergeable() broken XDP_REDIRECT uses napi_alloc_skb().
> >
> > The simple explanation is that receive_mergeable() is broken because
> > it uses napi_alloc_skb(), which violates XDP given XDP assumes packet
> > header+data in single page and enough tail room for skb_shared_info.
> >
> > The longer explanation is that receive_mergeable() tries to
> > work-around and satisfy these XDP requirements e.g. by having a
> > function xdp_linearize_page() that allocates and memcpy RX buffers
> > around (in case packet is scattered across multiple rx buffers).  This
> > does currently satisfy XDP_PASS, XDP_DROP and XDP_TX (but only because
> > we have not implemented bpf_xdp_adjust_tail yet).
> >
> > The XDP_REDIRECT action combined with cpumap is broken, and cause hard
> > to debug crashes.  The main issue is that the RX packet does not have
> > the needed tail-room (SKB_DATA_ALIGN(skb_shared_info)), causing
> > skb_shared_info to overlap the next packets head-room (in which cpumap
> > stores info).
> >
> > Reproducing depend on the packet payload length and if RX-buffer size
> > happened to have tail-room for skb_shared_info or not.  But to make
> > this even harder to troubleshoot, the RX-buffer size is runtime
> > dynamically change based on an Exponentially Weighted Moving Average
> > (EWMA) over the packet length, when refilling RX rings.
> >
> > This patch only disable XDP_REDIRECT support in receive_mergeable()
> > case, because it can cause a real crash.
> >
> > But IMHO we should NOT support XDP in receive_mergeable() at all,
> > because the principles behind XDP are to gain speed by (1) code
> > simplicity, (2) sacrificing memory and (3) where possible moving
> > runtime checks to setup time.  These principles are clearly being
> > violated in receive_mergeable(), that e.g. runtime track average
> > buffer size to save memory consumption.  
> 
> I agree to disable it for -net now. 

Okay... I'll send an official patch later.

> For net-next, we probably can do:
> 
> - drop xdp_linearize_page() and do XDP through generic XDP helper
>   after skb was built

I disagree strongly here - it makes no sense.

Why do you want to explicit fallback to Generic-XDP?
(... then all the performance gain is gone!)
And besides, a couple of function calls later, the generic XDP code
will/can get invoked anyhow...


Take a step back:
 What is the reason/use-case for implementing XDP inside virtio_net?

From a DDoS/performance perspective XDP in virtio_net happens on the
"wrong-side" as it is activated _inside_ the guest OS, which is too
late for a DDoS filter, as the guest kick/switch overhead have already
occurred.

I do use XDP_DROP inside the guest (driver virtio_net), but just to
perform what I can zoom-in benchmarking, for perf-record isolating the
early RX code path in the guest.  (Using iptables "raw" table drop is
almost as useful for that purpose).



The XDP ndo_xdp_xmit in tuntap/tun.c (that you also implemented) is
significantly more interesting.  As it allow us to skip large parts of
the network stack and redirect from a physical device (ixgbe) into a
guest device.  Ran a benchmark:
 - 0.5 Mpps with normal code path into device with driver tun
 - 3.7 Mpps with XDP_REDIRECT from ixgbe into same device

Plus, there are indications that 3.7Mpps is not the real limit, as
guest CPU doing XDP_DROP is 75% idle... thus this is likely a
scheduling + queue size issue.


> - disable EWMA when XDP is set and reserve enough tailroom.
> 
> >
> > Besides the described bug:
> >
> > Update(1): There is also a OOM leak in the XDP_REDIRECT code, which
> > receive_small() is likely also affected by.
> >
> > Update(2): Also observed a guest crash when redirecting out an
> > another virtio_net device, when device is down.  
> 
> Will have a look at these issues. (Holiday in china now, so will do it 
> after).



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: TCP and BBR: reproducibly low cwnd and bandwidth

2018-02-16 Thread Oleksandr Natalenko
Hi, David, Eric, Neal et al.

On čtvrtek 15. února 2018 21:42:26 CET Oleksandr Natalenko wrote:
> I've faced an issue with a limited TCP bandwidth between my laptop and a
> server in my 1 Gbps LAN while using BBR as a congestion control mechanism.
> To verify my observations, I've set up 2 KVM VMs with the following
> parameters:
> 
> 1) Linux v4.15.3
> 2) virtio NICs
> 3) 128 MiB of RAM
> 4) 2 vCPUs
> 5) tested on both non-PREEMPT/100 Hz and PREEMPT/1000 Hz
> 
> The VMs are interconnected via host bridge (-netdev bridge). I was running
> iperf3 in the default and reverse mode. Here are the results:
> 
> 1) BBR on both VMs
> 
> upload: 3.42 Gbits/sec, cwnd ~ 320 KBytes
> download: 3.39 Gbits/sec, cwnd ~ 320 KBytes
> 
> 2) Reno on both VMs
> 
> upload: 5.50 Gbits/sec, cwnd = 976 KBytes (constant)
> download: 5.22 Gbits/sec, cwnd = 1.20 MBytes (constant)
> 
> 3) Reno on client, BBR on server
> 
> upload: 5.29 Gbits/sec, cwnd = 952 KBytes (constant)
> download: 3.45 Gbits/sec, cwnd ~ 320 KBytes
> 
> 4) BBR on client, Reno on server
> 
> upload: 3.36 Gbits/sec, cwnd ~ 370 KBytes
> download: 5.21 Gbits/sec, cwnd = 887 KBytes (constant)
> 
> So, as you may see, when BBR is in use, upload rate is bad and cwnd is low.
> If using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput
> to ~100 Mbps (verifiable not only by iperf3, but also by scp while
> transferring some files between hosts).
> 
> Also, I've tried to use YeAH instead of Reno, and it gives me the same
> results as Reno (IOW, YeAH works fine too).
> 
> Questions:
> 
> 1) is this expected?
> 2) or am I missing some extra BBR tuneable?
> 3) if it is not a regression (I don't have any previous data to compare
> with), how can I fix this?
> 4) if it is a bug in BBR, what else should I provide or check for a proper
> investigation?

I've played with BBR a little bit more and managed to narrow the issue down to 
the changes between v4.12 and v4.13. Here are my observations:

v4.12 + BBR + fq_codel == OK
v4.12 + BBR + fq   == OK
v4.13 + BBR + fq_codel == Not OK
v4.13 + BBR + fq   == OK

I think this has something to do with an internal TCP implementation for 
pacing, that was introduced in v4.13 (commit 218af599fa63) specifically to 
allow using BBR together with non-fq qdiscs. Once BBR relies on fq, the 
throughput is high and saturates the link, but if another qdisc is in use, for 
instance, fq_codel, the throughput drops. Just to be sure, I've also tried 
pfifo_fast instead of fq_codel with the same outcome resulting in the low 
throughput.

Unfortunately, I do not know if this is something expected or should be 
considered as a regression. Thus, asking for an advice.

Ideas?

Thanks.

Regards,
  Oleksandr




  1   2   >