[net-next v2 1/2] bpf, seccomp: Add eBPF filter capabilities
This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant to be used for seccomp filters as an alternative to cBPF filters. The program type has relatively limited capabilities in terms of helpers, but that can be extended later on. The eBPF code loading is separated from attachment of the filter, so a privileged user can load the filter, and pass it back to an unprivileged user who can attach it and use it at a later time. In order to attach the filter itself, you need to supply a flag to the seccomp syscall indicating that a eBPF filter is being attached, as opposed to a cBPF one. Verification occurs at program load time, so the user should only receive errors related to attachment. Signed-off-by: Sargun Dhillon--- arch/Kconfig | 8 +++ include/linux/bpf_types.h| 3 + include/linux/seccomp.h | 3 +- include/uapi/linux/bpf.h | 2 + include/uapi/linux/seccomp.h | 7 ++- kernel/bpf/syscall.c | 1 + kernel/seccomp.c | 145 +-- 7 files changed, 147 insertions(+), 22 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 76c0b54443b1..8490d35e59d6 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -401,6 +401,14 @@ config SECCOMP_FILTER See Documentation/prctl/seccomp_filter.txt for details. +config SECCOMP_FILTER_EXTENDED + bool "Extended BPF seccomp filters" + depends on SECCOMP_FILTER && BPF_SYSCALL + depends on !CHECKPOINT_RESTORE + help + Enables seccomp filters to be written in eBPF, as opposed + to just cBPF filters. 
+ config HAVE_GCC_PLUGINS bool help diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 19b8349a3809..945c65c4e461 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #endif +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED +BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index c723a5c4e3ff..a7df3ba6cf25 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -5,7 +5,8 @@ #include #define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ -SECCOMP_FILTER_FLAG_LOG) +SECCOMP_FILTER_FLAG_LOG | \ +SECCOMP_FILTER_FLAG_EXTENDED) #ifdef CONFIG_SECCOMP diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index db6bdc375126..5f96cb7ed954 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1,3 +1,4 @@ + /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * @@ -133,6 +134,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SECCOMP, }; enum bpf_attach_type { diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 2a0bd9dd104d..730af6c7ec2e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -16,10 +16,11 @@ #define SECCOMP_SET_MODE_FILTER1 #define SECCOMP_GET_ACTION_AVAIL 2 -/* Valid flags for SECCOMP_SET_MODE_FILTER */ -#define SECCOMP_FILTER_FLAG_TSYNC 1 -#define SECCOMP_FILTER_FLAG_LOG2 +/* Valid flags for SECCOMP_SET_MODE_FILTER */ +#define SECCOMP_FILTER_FLAG_TSYNC (1 << 0) +#define SECCOMP_FILTER_FLAG_LOG(1 << 1) +#define SECCOMP_FILTER_FLAG_EXTENDED (1 << 2) /* * All BPF programs must return 
a 32-bit value. * The bottom 16-bits are for optional return data. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e24aa3241387..86d6ec8b916d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr) if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && + type != BPF_PROG_TYPE_SECCOMP && !capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 940fa408a288..f8ddc4af1135 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -37,6 +37,7 @@ #include #include #include +#include /** * struct seccomp_filter - container for seccomp BPF programs @@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); - /* -* Installing a seccomp filter requires that the task has -* CAP_SYS_ADMIN in its namespace or be running with
[net-next v2 2/2] bpf: Add eBPF seccomp sample programs
This adds a sample program that uses seccomp-eBPF, called seccomp1. It shows the simple ability to code seccomp filters in C. Signed-off-by: Sargun Dhillon--- samples/bpf/Makefile| 5 + samples/bpf/bpf_load.c | 9 +++-- samples/bpf/seccomp1_kern.c | 43 +++ samples/bpf/seccomp1_user.c | 45 + 4 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 samples/bpf/seccomp1_kern.c create mode 100644 samples/bpf/seccomp1_user.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index ec3fc8d88e87..264838846f71 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu hostprogs-y += xdp_monitor hostprogs-y += xdp_rxq_info hostprogs-y += syscall_tp +hostprogs-y += seccomp1 # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o @@ -93,6 +94,8 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o +seccomp1-objs := bpf_load.o $(LIBBPF) seccomp1_user.o + # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -144,6 +147,7 @@ always += xdp_monitor_kern.o always += xdp_rxq_info_kern.o always += xdp2skb_meta_kern.o always += syscall_tp_kern.o +always += seccomp1_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -188,6 +192,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf HOSTLOADLIBES_xdp_rxq_info += -lelf HOSTLOADLIBES_syscall_tp += -lelf +HOSTLOADLIBES_seccomp1 += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 69806d74fa53..856bc8b93916 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -67,6 
+67,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; bool is_sockops = strncmp(event, "sockops", 7) == 0; bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; + bool is_seccomp = strncmp(event, "seccomp", 7) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -96,6 +97,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_SOCK_OPS; } else if (is_sk_skb) { prog_type = BPF_PROG_TYPE_SK_SKB; + } else if (is_seccomp) { + prog_type = BPF_PROG_TYPE_SECCOMP; } else { printf("Unknown event '%s'\n", event); return -1; @@ -110,7 +113,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_fd[prog_cnt++] = fd; - if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) + if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk || + is_seccomp) return 0; if (is_socket || is_sockops || is_sk_skb) { @@ -589,7 +593,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "socket", 6) == 0 || memcmp(shname, "cgroup/", 7) == 0 || memcmp(shname, "sockops", 7) == 0 || - memcmp(shname, "sk_skb", 6) == 0) { + memcmp(shname, "sk_skb", 6) == 0 || + memcmp(shname, "seccomp", 7) == 0) { ret = load_and_attach(shname, data->d_buf, data->d_size); if (ret != 0) diff --git a/samples/bpf/seccomp1_kern.c b/samples/bpf/seccomp1_kern.c new file mode 100644 index ..420e37eebd92 --- /dev/null +++ b/samples/bpf/seccomp1_kern.c @@ -0,0 +1,43 @@ +#include +#include +#include +#include "bpf_helpers.h" +#include +#include + +#if defined(__x86_64__) +#define ARCH AUDIT_ARCH_X86_64 +#elif defined(__i386__) +#define ARCH AUDIT_ARCH_I386 +#else +#endif + +#ifdef ARCH +/* Returns EPERM when trying to close fd 999 */ +SEC("seccomp") +int bpf_prog1(struct seccomp_data *ctx) +{ + /* +* Make sure this BPF program is being run on the same 
architecture it +* was compiled on. +*/ + if (ctx->arch != ARCH) + return SECCOMP_RET_ERRNO | EPERM; + if (ctx->nr == __NR_close && ctx->args[0] == 999) + return SECCOMP_RET_ERRNO | EPERM; + + return SECCOMP_RET_ALLOW; +} +#else +#warning Architecture not supported -- Blocking all syscalls +SEC("seccomp") +int
[net-next v2 0/2] eBPF Seccomp filters
This patchset enables seccomp filters to be written in eBPF. Although, this patchset doesn't introduce much of the functionality enabled by eBPF, it lays the ground work for it. Currently, you have to disable CHECKPOINT_RESTORE support in order to utilize eBPF seccomp filters, as eBPF filters cannot be retrieved via the ptrace GET_FILTER API. Any user can load a bpf seccomp filter program, and it can be pinned and reused without requiring access to the bpf syscalls. A user only requires the traditional permissions of either being cap_sys_admin, or have no_new_privs set in order to install their rule. The primary reason for not adding maps support in this patchset is to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS. If we have a map that the BPF program can read, it can potentially "change" privileges after running. It seems like doing writes only is safe, because it can be pure, and side effect free, and therefore not negatively effect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come to an agreement, this can be in a follow-up patchset. A benchmark of this patchset is as follows for a very standard eBPF filter: Given this test program: for (i = 10; i < ; i++) syscall(__NR_getpid); If I implement an eBPF filter with PROG_ARRAYs with a program per syscall, and tail call, the numbers are such: ebpf JIT 12.3% slower than native ebpf no JIT 13.6% slower than native seccomp JIT 17.6% slower than native seccomp no JIT 37% slower than native The speed of the traditional seccomp filter increases O(n) with the number of syscalls with discrete rulesets, whereas ebpf is O(1), given any number of syscall filters. 
Changes since v1: * Use a flag to indicate loading an eBPF filter, not a separate command * Remove printk helper * Remove ptrace patch / restore filter / sample * Add some safe helpers Sargun Dhillon (2): bpf, seccomp: Add eBPF filter capabilities bpf: Add eBPF seccomp sample programs arch/Kconfig | 8 +++ include/linux/bpf_types.h| 3 + include/linux/seccomp.h | 3 +- include/uapi/linux/bpf.h | 2 + include/uapi/linux/seccomp.h | 7 ++- kernel/bpf/syscall.c | 1 + kernel/seccomp.c | 145 +-- samples/bpf/Makefile | 5 ++ samples/bpf/bpf_load.c | 9 ++- samples/bpf/seccomp1_kern.c | 43 + samples/bpf/seccomp1_user.c | 45 ++ 11 files changed, 247 insertions(+), 24 deletions(-) create mode 100644 samples/bpf/seccomp1_kern.c create mode 100644 samples/bpf/seccomp1_user.c -- 2.14.1
[PATCH net] xfrm: do not call rcu_read_unlock when afinfo is NULL in xfrm_get_tos
When xfrm_policy_get_afinfo returns NULL, it will not hold rcu read lock. In this case, rcu_read_unlock should not be called in xfrm_get_tos, just like other places where it's calling xfrm_policy_get_afinfo. Fixes: f5e2bb4f5b22 ("xfrm: policy: xfrm_get_tos cannot fail") Signed-off-by: Xin Long--- net/xfrm/xfrm_policy.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7a23078..dd4041f 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1458,10 +1458,13 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, static int xfrm_get_tos(const struct flowi *fl, int family) { const struct xfrm_policy_afinfo *afinfo; - int tos = 0; + int tos; afinfo = xfrm_policy_get_afinfo(family); - tos = afinfo ? afinfo->get_tos(fl) : 0; + if (!afinfo) + return 0; + + tos = afinfo->get_tos(fl); rcu_read_unlock(); -- 2.1.0
Re: [PATCH net-next 1/3] bpf, seccomp: Add eBPF filter capabilities
On Tue, Feb 13, 2018 at 12:34 PM, Kees Cookwrote: > On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon wrote: >> From: Sargun Dhillon >> >> This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant >> to be used for seccomp filters as an alternative to cBPF filters. The >> program type has relatively limited capabilities in terms of helpers, >> but that can be extended later on. >> >> It also introduces a new mechanism to attach these filters via the >> prctl and seccomp syscalls -- SECCOMP_MODE_FILTER_EXTENDED, and >> SECCOMP_SET_MODE_FILTER_EXTENDED respectively. >> >> Signed-off-by: Sargun Dhillon >> --- >> arch/Kconfig | 7 ++ >> include/linux/bpf_types.h| 3 + >> include/uapi/linux/bpf.h | 2 + >> include/uapi/linux/seccomp.h | 15 +++-- >> kernel/bpf/syscall.c | 1 + >> kernel/seccomp.c | 148 >> +-- >> 6 files changed, 150 insertions(+), 26 deletions(-) >> >> diff --git a/arch/Kconfig b/arch/Kconfig >> index 76c0b54443b1..db675888577c 100644 >> --- a/arch/Kconfig >> +++ b/arch/Kconfig >> @@ -401,6 +401,13 @@ config SECCOMP_FILTER >> >> See Documentation/prctl/seccomp_filter.txt for details. >> >> +config SECCOMP_FILTER_EXTENDED >> + bool "Extended BPF seccomp filters" >> + depends on SECCOMP_FILTER && BPF_SYSCALL >> + help >> + Enables seccomp filters to be written in eBPF, as opposed >> + to just cBPF filters. 
>> + >> config HAVE_GCC_PLUGINS >> bool >> help >> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h >> index 19b8349a3809..945c65c4e461 100644 >> --- a/include/linux/bpf_types.h >> +++ b/include/linux/bpf_types.h >> @@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) >> #ifdef CONFIG_CGROUP_BPF >> BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) >> #endif >> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED >> +BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp) >> +#endif >> >> BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) >> BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index db6bdc375126..5f96cb7ed954 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -1,3 +1,4 @@ >> + >> /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ >> /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com >> * >> @@ -133,6 +134,7 @@ enum bpf_prog_type { >> BPF_PROG_TYPE_SOCK_OPS, >> BPF_PROG_TYPE_SK_SKB, >> BPF_PROG_TYPE_CGROUP_DEVICE, >> + BPF_PROG_TYPE_SECCOMP, >> }; >> >> enum bpf_attach_type { >> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h >> index 2a0bd9dd104d..7da8b39f2a6a 100644 >> --- a/include/uapi/linux/seccomp.h >> +++ b/include/uapi/linux/seccomp.h >> @@ -7,14 +7,17 @@ >> >> >> /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, ) */ >> -#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */ >> -#define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */ >> -#define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */ >> +#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */ >> +#define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */ >> +#define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. 
*/ >> +#define SECCOMP_MODE_FILTER_EXTENDED 3 /* uses eBPF filter from fd */ > > This MODE flag isn't needed: it's still using a filter, and the > interface changes should be sufficient with > SECCOMP_SET_MODE_FILTER_EXTENDED below. > >> /* Valid operations for seccomp syscall. */ >> -#define SECCOMP_SET_MODE_STRICT0 >> -#define SECCOMP_SET_MODE_FILTER1 >> -#define SECCOMP_GET_ACTION_AVAIL 2 >> +#define SECCOMP_SET_MODE_STRICT0 >> +#define SECCOMP_SET_MODE_FILTER1 >> +#define SECCOMP_GET_ACTION_AVAIL 2 >> +#define SECCOMP_SET_MODE_FILTER_EXTENDED 3 > > It seems like this should be a FILTER flag, not a syscall op change? > >> + >> >> /* Valid flags for SECCOMP_SET_MODE_FILTER */ >> #define SECCOMP_FILTER_FLAG_TSYNC 1 >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c >> index e24aa3241387..86d6ec8b916d 100644 >> --- a/kernel/bpf/syscall.c >> +++ b/kernel/bpf/syscall.c >> @@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr) >> >> if (type != BPF_PROG_TYPE_SOCKET_FILTER && >> type != BPF_PROG_TYPE_CGROUP_SKB && >> + type != BPF_PROG_TYPE_SECCOMP && >> !capable(CAP_SYS_ADMIN)) >> return -EPERM; > > So only init_ns-CAP_SYS_ADMIN would be able to use seccomp eBPF? > No, this is specifically so non-init CAP_SYS_ADMIN cal load BPF filters that are either socket_filter, cgroup_skb, or seccomp.
Re: [PATCH V6 2/4] sctp: Add ip option support
On Fri, Feb 16, 2018 at 07:51:02PM -0200, Marcelo Ricardo Leitner wrote: > On Fri, Feb 16, 2018 at 03:14:35PM -0500, Neil Horman wrote: > > On Fri, Feb 16, 2018 at 10:56:07AM -0200, Marcelo Ricardo Leitner wrote: > > > On Thu, Feb 15, 2018 at 09:15:40AM -0500, Neil Horman wrote: > > > > On Tue, Feb 13, 2018 at 08:54:44PM +, Richard Haines wrote: > > > > > Add ip option support to allow LSM security modules to utilise > > > > > CIPSO/IPv4 > > > > > and CALIPSO/IPv6 services. > > > > > > > > > > Signed-off-by: Richard Haines> > > > > --- > > > > > include/net/sctp/sctp.h| 4 +++- > > > > > include/net/sctp/structs.h | 2 ++ > > > > > net/sctp/chunk.c | 12 +++- > > > > > net/sctp/ipv6.c| 42 > > > > > +++--- > > > > > net/sctp/output.c | 5 - > > > > > net/sctp/protocol.c| 36 > > > > > net/sctp/socket.c | 14 ++ > > > > > 7 files changed, 97 insertions(+), 18 deletions(-) > > > > > > > > > > diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h > > > > > index f7ae6b0..25c5c87 100644 > > > > > --- a/include/net/sctp/sctp.h > > > > > +++ b/include/net/sctp/sctp.h > > > > > @@ -441,9 +441,11 @@ static inline int sctp_list_single_entry(struct > > > > > list_head *head) > > > > > static inline int sctp_frag_point(const struct sctp_association > > > > > *asoc, int pmtu) > > > > > { > > > > > struct sctp_sock *sp = sctp_sk(asoc->base.sk); > > > > > + struct sctp_af *af = sp->pf->af; > > > > > int frag = pmtu; > > > > > > > > > > - frag -= sp->pf->af->net_header_len; > > > > > + frag -= af->ip_options_len(asoc->base.sk); > > > > > + frag -= af->net_header_len; > > > > > frag -= sizeof(struct sctphdr) + > > > > > sctp_datachk_len(>stream); > > > > > > > > > > if (asoc->user_frag) > > > > > diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h > > > > > index 03e92dd..ead5fce 100644 > > > > > --- a/include/net/sctp/structs.h > > > > > +++ b/include/net/sctp/structs.h > > > > > @@ -491,6 +491,7 @@ struct sctp_af { > > > > > void(*ecn_capable)(struct sock 
*sk); > > > > > __u16 net_header_len; > > > > > int sockaddr_len; > > > > > + int (*ip_options_len)(struct sock *sk); > > > > > sa_family_t sa_family; > > > > > struct list_head list; > > > > > }; > > > > > @@ -515,6 +516,7 @@ struct sctp_pf { > > > > > int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr > > > > > *addr); > > > > > void (*to_sk_saddr)(union sctp_addr *, struct sock *sk); > > > > > void (*to_sk_daddr)(union sctp_addr *, struct sock *sk); > > > > > + void (*copy_ip_options)(struct sock *sk, struct sock *newsk); > > > > > struct sctp_af *af; > > > > > }; > > > > > > > > > > diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c > > > > > index 991a530..d5c0ef7 100644 > > > > > --- a/net/sctp/chunk.c > > > > > +++ b/net/sctp/chunk.c > > > > > @@ -154,7 +154,6 @@ static void sctp_datamsg_assign(struct > > > > > sctp_datamsg *msg, struct sctp_chunk *chu > > > > > chunk->msg = msg; > > > > > } > > > > > > > > > > - > > > > > /* A data chunk can have a maximum payload of (2^16 - 20). Break > > > > > * down any such message into smaller chunks. Opportunistically, > > > > > fragment > > > > > * the chunks down to the current MTU constraints. 
We may get > > > > > refragmented > > > > > @@ -171,6 +170,8 @@ struct sctp_datamsg > > > > > *sctp_datamsg_from_user(struct sctp_association *asoc, > > > > > struct list_head *pos, *temp; > > > > > struct sctp_chunk *chunk; > > > > > struct sctp_datamsg *msg; > > > > > + struct sctp_sock *sp; > > > > > + struct sctp_af *af; > > > > > int err; > > > > > > > > > > msg = sctp_datamsg_new(GFP_KERNEL); > > > > > @@ -189,9 +190,11 @@ struct sctp_datamsg > > > > > *sctp_datamsg_from_user(struct sctp_association *asoc, > > > > > /* This is the biggest possible DATA chunk that can fit into > > > > >* the packet > > > > >*/ > > > > > - max_data = asoc->pathmtu - > > > > > -sctp_sk(asoc->base.sk)->pf->af->net_header_len - > > > > > -sizeof(struct sctphdr) - > > > > > sctp_datachk_len(>stream); > > > > > + sp = sctp_sk(asoc->base.sk); > > > > > + af = sp->pf->af; > > > > > + max_data = asoc->pathmtu - af->net_header_len - > > > > > +sizeof(struct sctphdr) - > > > > > sctp_datachk_len(>stream) - > > > > > +af->ip_options_len(asoc->base.sk); > > > > > max_data = SCTP_TRUNC4(max_data); > > > > > > > > > > /* If the the peer requested that we authenticate DATA chunks > > > > > @@ -211,7 +214,6 @@ struct sctp_datamsg > > > > > *sctp_datamsg_from_user(struct sctp_association *asoc,
[PATCH nf] netfilter: IDLETIMER: be syzkaller friendly
From: Eric DumazetWe had one report from syzkaller [1] First issue is that INIT_WORK() should be done before mod_timer() or we risk timer being fired too soon, even with a 1 second timer. Second issue is that we need to reject too big info->timeout to avoid overflows in msecs_to_jiffies(info->timeout * 1000), or risk looping, if result after overflow is 0. [1] WARNING: CPU: 1 PID: 5129 at kernel/workqueue.c:1444 __queue_work+0xdf4/0x1230 kernel/workqueue.c:1444 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 5129 Comm: syzkaller159866 Not tainted 4.16.0-rc1+ #230 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 panic+0x1e4/0x41c kernel/panic.c:183 __warn+0x1dc/0x200 kernel/panic.c:547 report_bug+0x211/0x2d0 lib/bug.c:184 fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178 fixup_bug arch/x86/kernel/traps.c:247 [inline] do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315 invalid_op+0x22/0x40 arch/x86/entry/entry_64.S:988 RIP: 0010:__queue_work+0xdf4/0x1230 kernel/workqueue.c:1444 RSP: 0018:8801db507538 EFLAGS: 00010006 RAX: 8801aeb46080 RBX: 8801db530200 RCX: 81481404 RDX: 0100 RSI: 86b42640 RDI: 0082 RBP: 8801db507758 R08: 11003b6a0de5 R09: 000c R10: 8801db5073f0 R11: 0020 R12: 11003b6a0eb6 R13: 8801b1067ae0 R14: 01f8 R15: dc00 queue_work_on+0x16a/0x1c0 kernel/workqueue.c:1488 queue_work include/linux/workqueue.h:488 [inline] schedule_work include/linux/workqueue.h:546 [inline] idletimer_tg_expired+0x44/0x60 net/netfilter/xt_IDLETIMER.c:116 call_timer_fn+0x228/0x820 kernel/time/timer.c:1326 expire_timers kernel/time/timer.c:1363 [inline] __run_timers+0x7ee/0xb70 kernel/time/timer.c:1666 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692 __do_softirq+0x2d7/0xb85 kernel/softirq.c:285 invoke_softirq kernel/softirq.c:365 [inline] irq_exit+0x1cc/0x200 
kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:541 [inline] smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:829 RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:777 [inline] RIP: 0010:__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:160 [inline] RIP: 0010:_raw_spin_unlock_irqrestore+0x5e/0xba kernel/locking/spinlock.c:184 RSP: 0018:8801c20173c8 EFLAGS: 0282 ORIG_RAX: ff12 RAX: dc00 RBX: 0282 RCX: 0006 RDX: 10d592cd RSI: 110035d68d23 RDI: 0282 RBP: 8801c20173d8 R08: 110038402e47 R09: R10: R11: R12: 8820e5c8 R13: 8801b1067ad8 R14: 8801aea7c268 R15: 8801aea7c278 __debug_object_init+0x235/0x1040 lib/debugobjects.c:378 debug_object_init+0x17/0x20 lib/debugobjects.c:391 __init_work+0x2b/0x60 kernel/workqueue.c:506 idletimer_tg_create net/netfilter/xt_IDLETIMER.c:152 [inline] idletimer_tg_checkentry+0x691/0xb00 net/netfilter/xt_IDLETIMER.c:213 xt_check_target+0x22c/0x7d0 net/netfilter/x_tables.c:850 check_target net/ipv6/netfilter/ip6_tables.c:533 [inline] find_check_entry.isra.7+0x935/0xcf0 net/ipv6/netfilter/ip6_tables.c:575 translate_table+0xf52/0x1690 net/ipv6/netfilter/ip6_tables.c:744 do_replace net/ipv6/netfilter/ip6_tables.c:1160 [inline] do_ip6t_set_ctl+0x370/0x5f0 net/ipv6/netfilter/ip6_tables.c:1686 nf_sockopt net/netfilter/nf_sockopt.c:106 [inline] nf_setsockopt+0x67/0xc0 net/netfilter/nf_sockopt.c:115 ipv6_setsockopt+0x10b/0x130 net/ipv6/ipv6_sockglue.c:927 udpv6_setsockopt+0x45/0x80 net/ipv6/udp.c:1422 sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2976 SYSC_setsockopt net/socket.c:1850 [inline] SyS_setsockopt+0x189/0x360 net/socket.c:1829 do_syscall_64+0x282/0x940 arch/x86/entry/common.c:287 Fixes: 0902b469bd25 ("netfilter: xtables: idletimer target implementation") Signed-off-by: Eric Dumazet Reported-by: syzkaller --- net/netfilter/xt_IDLETIMER.c |9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git 
a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 6c2482b709b1eca341926a8393f45dfead358561..1ac6600bfafd60b6b4d5aaf88a41fb57c6ec195b 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -146,11 +146,11 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) timer_setup(>timer->timer, idletimer_tg_expired, 0); info->timer->refcnt = 1; + INIT_WORK(>timer->work, idletimer_tg_work); + mod_timer(>timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); - INIT_WORK(>timer->work,
Re: [RFC PATCH v3 2/3] virtio_net: Extend virtio to use VF datapath when available
On Fri, 16 Feb 2018 10:11:21 -0800, Sridhar Samudrala wrote: > This patch enables virtio_net to switch over to a VF datapath when a VF > netdev is present with the same MAC address. It allows live migration > of a VM with a direct attached VF without the need to setup a bond/team > between a VF and virtio net device in the guest. > > The hypervisor needs to enable only one datapath at any time so that > packets don't get looped back to the VM over the other datapath. When a VF > is plugged, the virtio datapath link state can be marked as down. The > hypervisor needs to unplug the VF device from the guest on the source host > and reset the MAC filter of the VF to initiate failover of datapath to > virtio before starting the migration. After the migration is completed, > the destination hypervisor sets the MAC filter on the VF and plugs it back > to the guest to switch over to VF datapath. > > When BACKUP feature is enabled, an additional netdev(bypass netdev) is > created that acts as a master device and tracks the state of the 2 lower > netdevs. The original virtio_net netdev is marked as 'backup' netdev and a > passthru device with the same MAC is registered as 'active' netdev. > > This patch is based on the discussion initiated by Jesse on this thread. 
> https://marc.info/?l=linux-virtualization=151189725224231=2 > > Signed-off-by: Sridhar Samudrala> Signed-off-by: Alexander Duyck > +static void > +virtnet_bypass_get_stats(struct net_device *dev, > + struct rtnl_link_stats64 *stats) > +{ > + struct virtnet_bypass_info *vbi = netdev_priv(dev); > + const struct rtnl_link_stats64 *new; > + struct rtnl_link_stats64 temp; > + struct net_device *child_netdev; > + > + spin_lock(>stats_lock); > + memcpy(stats, >bypass_stats, sizeof(*stats)); > + > + rcu_read_lock(); > + > + child_netdev = rcu_dereference(vbi->active_netdev); > + if (child_netdev) { > + new = dev_get_stats(child_netdev, ); > + virtnet_bypass_fold_stats(stats, new, >active_stats); > + memcpy(>active_stats, new, sizeof(*new)); > + } > + > + child_netdev = rcu_dereference(vbi->backup_netdev); > + if (child_netdev) { > + new = dev_get_stats(child_netdev, ); > + virtnet_bypass_fold_stats(stats, new, >backup_stats); > + memcpy(>backup_stats, new, sizeof(*new)); > + } > + > + rcu_read_unlock(); > + > + memcpy(>bypass_stats, stats, sizeof(*stats)); > + spin_unlock(>stats_lock); > +} > + > +static int virtnet_bypass_change_mtu(struct net_device *dev, int new_mtu) > +{ > + struct virtnet_bypass_info *vbi = netdev_priv(dev); > + struct net_device *child_netdev; > + int ret = 0; > + > + child_netdev = rcu_dereference(vbi->active_netdev); > + if (child_netdev) { > + ret = dev_set_mtu(child_netdev, new_mtu); > + if (ret) > + return ret; > + } > + > + child_netdev = rcu_dereference(vbi->backup_netdev); > + if (child_netdev) { > + ret = dev_set_mtu(child_netdev, new_mtu); > + if (ret) > + netdev_err(child_netdev, > +"Unexpected failure to set mtu to %d\n", > +new_mtu); You should probably unwind if set fails on one of the legs. > + } > + > + dev->mtu = new_mtu; > + return 0; > +} nit: stats, mtu, all those mundane things are implemented in team already. If we had this as kernel-internal team mode we wouldn't have to reimplement them... 
You probably did investigate that option, for my edification, would you mind saying what the challenges/downsides were? > +static struct net_device * > +get_virtnet_bypass_bymac(struct net *net, const u8 *mac) > +{ > + struct net_device *dev; > + > + ASSERT_RTNL(); > + > + for_each_netdev(net, dev) { > + if (dev->netdev_ops != _bypass_netdev_ops) > + continue; /* not a virtnet_bypass device */ Is there anything inherently wrong with enslaving another virtio dev now? I was expecting something like a hash map to map MAC addr -> master and then one can check if dev is already enslaved to that master. Just a random thought, I'm probably missing something... > + if (ether_addr_equal(mac, dev->perm_addr)) > + return dev; > + } > + > + return NULL; > +} > + > +static struct net_device * > +get_virtnet_bypass_byref(struct net_device *child_netdev) > +{ > + struct net *net = dev_net(child_netdev); > + struct net_device *dev; > + > + ASSERT_RTNL(); > + > + for_each_netdev(net, dev) { > + struct virtnet_bypass_info *vbi; > + > + if (dev->netdev_ops != _bypass_netdev_ops) > + continue; /* not a virtnet_bypass device */ > + > + vbi =
Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
On Fri, 16 Feb 2018 10:11:19 -0800, Sridhar Samudrala wrote: > Ppatch 2 is in response to the community request for a 3 netdev > solution. However, it creates some issues we'll get into in a moment. > It extends virtio_net to use alternate datapath when available and > registered. When BACKUP feature is enabled, virtio_net driver creates > an additional 'bypass' netdev that acts as a master device and controls > 2 slave devices. The original virtio_net netdev is registered as > 'backup' netdev and a passthru/vf device with the same MAC gets > registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are > associated with the same 'pci' device. The user accesses the network > interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev > as default for transmits when it is available with link up and running. Thank you for doing this. > We noticed a couple of issues with this approach during testing. > - As both 'bypass' and 'backup' netdevs are associated with the same > virtio pci device, udev tries to rename both of them with the same name > and the 2nd rename will fail. This would be OK as long as the first netdev > to be renamed is the 'bypass' netdev, but the order in which udev gets > to rename the 2 netdevs is not reliable. Out of curiosity - why do you link the master netdev to the virtio struct device? FWIW two solutions that immediately come to mind is to export "backup" as phys_port_name of the backup virtio link and/or assign a name to the master like you are doing already. I think team uses team%d and bond uses bond%d, soft naming of master devices seems quite natural in this case. IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio link is quite neat. > - When the 'active' netdev is unplugged OR not present on a destination > system after live migration, the user will see 2 virtio_net netdevs. That's necessary and expected, all configuration applies to the master so master must exist.
[patch net] mlxsw: spectrum_router: Do not unconditionally clear route offload indication
From: Ido Schimmel
When mlxsw replaces (or deletes) a route it removes the offload indication from the replaced route. This is problematic for IPv4 routes, as the offload indication is stored in the fib_info which is usually shared between multiple routes. Instead of unconditionally clearing the offload indication, only clear it if no other route is using the fib_info. Fixes: 3984d1a89fe7 ("mlxsw: spectrum_router: Provide offload indication using nexthop flags") Signed-off-by: Ido Schimmel Reported-by: Alexander Petrovskiy Tested-by: Alexander Petrovskiy Signed-off-by: Jiri Pirko --- Dave, please, push to stable. Thanks! --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index dcc6305f7c22..f7948e983637 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -3794,6 +3794,9 @@ mlxsw_sp_fib4_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; int i; + if (!list_is_singular(&nh_grp->fib_list)) + return; + for (i = 0; i < nh_grp->count; i++) { struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; -- 2.14.3
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On pátek 16. února 2018 23:50:35 CET Eric Dumazet wrote: > /* snip */ > If you use > > tcptrace -R test_s2c.pcap > xplot.org d2c_rtt.xpl > > Then you'll see plenty of suspect 40ms rtt samples. That's odd. Even the way how they look uniformly. > It looks like receiver misses wakeups for some reason, > and only the TCP delayed ACK timer is helping. > > So it does not look like a sender side issue to me. To make things even more complicated, I've disabled sg on the server, leaving it enabled on the client: client to server flow: 935 Mbits/sec server to client flow: 72.5 Mbits/sec So still, to me it looks like a sender issue. No?
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On Fri, Feb 16, 2018 at 2:50 PM, Oleksandr Natalenkowrote: > Hi. > > On pátek 16. února 2018 21:54:05 CET Eric Dumazet wrote: >> /* snip */ >> Something fishy really : >> /* snip */ >> Not only the receiver suddenly adds a 25 ms delay, but also note that >> it acknowledges all prior segments (ack 112949), but with a wrong ecr >> value ( 2327043753 ) >> instead of 2327043759 >> /* snip */ > > Eric has encouraged me to look closer at what's there in the ethtool, and I've > just had a free time to play with it. I've found out that enabling scatter- > gather (ethtool -K enp3s0 sg on, it is disabled by default on both hosts) > brings the throughput back to normal even with BBR+fq_codel. > > Wh? What's the deal BBR has with sg? Well, no effect here on e1000e (1 Gbit) at least # ethtool -K eth3 sg off Actual changes: scatter-gather: off tx-scatter-gather: off tcp-segmentation-offload: off tx-tcp-segmentation: off [requested on] tx-tcp6-segmentation: off [requested on] generic-segmentation-offload: off [requested on] # tc qd replace dev eth3 root pfifo_fast # ./super_netperf 1 -H 7.7.7.84 -- -K cubic 941 # ./super_netperf 1 -H 7.7.7.84 -- -K bbr 941 # tc qd replace dev eth3 root fq # ./super_netperf 1 -H 7.7.7.84 -- -K cubic 941 # ./super_netperf 1 -H 7.7.7.84 -- -K bbr 941 # tc qd replace dev eth3 root fq_codel # ./super_netperf 1 -H 7.7.7.84 -- -K cubic 941 # ./super_netperf 1 -H 7.7.7.84 -- -K bbr 941 #
[PATCH net 3/3] net: qualcomm: rmnet: Fix possible null dereference in command processing
If a command packet with invalid mux id is received, the packet would not have a valid endpoint. This invalid endpoint may be dereferenced leading to a crash. Identified by manual code inspection. Fixes: 3352e6c45760 ("net: qualcomm: rmnet: Convert the muxed endpoint to hlist") Signed-off-by: Subash Abhinov Kasiviswanathan
--- drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c index 6bc328f..b0dbca0 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c @@ -38,6 +38,11 @@ static u8 rmnet_map_do_flow_control(struct sk_buff *skb, } ep = rmnet_get_endpoint(port, mux_id); + if (!ep) { + kfree_skb(skb); + return RX_HANDLER_CONSUMED; + } + vnd = ep->egress_dev; ip_family = cmd->flow_control.ip_family; -- 1.9.1
[PATCH net 1/3] net: qualcomm: rmnet: Fix crash on real dev unregistration
With CONFIG_DEBUG_PREEMPT enabled, a crash with the following call stack was observed when removing a real dev which had rmnet devices attached to it. To fix this, remove the netdev_upper link APIs and instead use the existing information in rmnet_port and rmnet_priv to get the association between real and rmnet devs. BUG: sleeping function called from invalid context in_atomic(): 0, irqs_disabled(): 0, pid: 5762, name: ip Preemption disabled at: [] debug_object_active_state+0xa4/0x16c Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: PC is at ___might_sleep+0x13c/0x180 LR is at ___might_sleep+0x17c/0x180 [] ___might_sleep+0x13c/0x180 [] __might_sleep+0x58/0x8c [] mutex_lock+0x2c/0x48 [] kernfs_remove_by_name_ns+0x48/0xa8 [] sysfs_remove_link+0x30/0x58 [] __netdev_adjacent_dev_remove+0x14c/0x1e0 [] __netdev_adjacent_dev_unlink_lists+0x40/0x68 [] netdev_upper_dev_unlink+0xb4/0x1fc [] rmnet_dev_walk_unreg+0x6c/0xc8 [] netdev_walk_all_lower_dev_rcu+0x58/0xb4 [] rmnet_config_notify_cb+0xf4/0x134 [] raw_notifier_call_chain+0x58/0x78 [] call_netdevice_notifiers_info+0x48/0x78 [] rollback_registered_many+0x230/0x3c8 [] unregister_netdevice_many+0x38/0x94 [] rtnl_delete_link+0x58/0x88 [] rtnl_dellink+0xbc/0x1cc [] rtnetlink_rcv_msg+0xb0/0x244 [] netlink_rcv_skb+0xb4/0xdc [] rtnetlink_rcv+0x34/0x44 [] netlink_unicast+0x1ec/0x294 [] netlink_sendmsg+0x320/0x390 [] sock_sendmsg+0x54/0x60 [] ___sys_sendmsg+0x298/0x2b0 [] SyS_sendmsg+0xb4/0xf0 [] el0_svc_naked+0x24/0x28 Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation") Fixes: 60d58f971c10 ("net: qualcomm: rmnet: Implement bridge mode") Signed-off-by: Subash Abhinov Kasiviswanathan--- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 68 +- 1 file changed, 14 insertions(+), 54 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 7e7704d..c494918 100644 --- 
a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -43,12 +43,6 @@ /* Local Definitions and Declarations */ -struct rmnet_walk_data { - struct net_device *real_dev; - struct list_head *head; - struct rmnet_port *port; -}; - static int rmnet_is_real_dev_registered(const struct net_device *real_dev) { return rcu_access_pointer(real_dev->rx_handler) == rmnet_rx_handler; @@ -112,17 +106,14 @@ static int rmnet_register_real_device(struct net_device *real_dev) static void rmnet_unregister_bridge(struct net_device *dev, struct rmnet_port *port) { - struct net_device *rmnet_dev, *bridge_dev; struct rmnet_port *bridge_port; + struct net_device *bridge_dev; if (port->rmnet_mode != RMNET_EPMODE_BRIDGE) return; /* bridge slave handling */ if (!port->nr_rmnet_devs) { - rmnet_dev = netdev_master_upper_dev_get_rcu(dev); - netdev_upper_dev_unlink(dev, rmnet_dev); - bridge_dev = port->bridge_ep; bridge_port = rmnet_get_port_rtnl(bridge_dev); @@ -132,9 +123,6 @@ static void rmnet_unregister_bridge(struct net_device *dev, bridge_dev = port->bridge_ep; bridge_port = rmnet_get_port_rtnl(bridge_dev); - rmnet_dev = netdev_master_upper_dev_get_rcu(bridge_dev); - netdev_upper_dev_unlink(bridge_dev, rmnet_dev); - rmnet_unregister_real_device(bridge_dev, bridge_port); } } @@ -173,10 +161,6 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, if (err) goto err1; - err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL, extack); - if (err) - goto err2; - port->rmnet_mode = mode; hlist_add_head_rcu(>hlnode, >muxed_ep[mux_id]); @@ -193,8 +177,6 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, return 0; -err2: - rmnet_vnd_dellink(mux_id, port, ep); err1: rmnet_unregister_real_device(real_dev, port); err0: @@ -204,14 +186,13 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, static void rmnet_dellink(struct net_device *dev, struct list_head *head) { + struct 
rmnet_priv *priv = netdev_priv(dev); struct net_device *real_dev; struct rmnet_endpoint *ep; struct rmnet_port *port; u8 mux_id; - rcu_read_lock(); - real_dev = netdev_master_upper_dev_get_rcu(dev); - rcu_read_unlock(); + real_dev = priv->real_dev; if (!real_dev || !rmnet_is_real_dev_registered(real_dev)) return; @@ -219,7 +200,6 @@ static void rmnet_dellink(struct net_device *dev, struct list_head *head) port = rmnet_get_port_rtnl(real_dev); mux_id = rmnet_vnd_get_mux(dev); -
[PATCH net 2/3] net: qualcomm: rmnet: Fix warning seen with 64 bit stats
With CONFIG_DEBUG_PREEMPT enabled, a warning was seen on device creation. This occurs due to the incorrect cpu API usage in ndo_get_stats64 handler. BUG: using smp_processor_id() in preemptible [] code: rmnetcli/5743 caller is debug_smp_processor_id+0x1c/0x24 Call trace: [] dump_backtrace+0x0/0x2a8 [] show_stack+0x20/0x28 [] dump_stack+0xa8/0xe0 [] check_preemption_disabled+0x104/0x108 [] debug_smp_processor_id+0x1c/0x24 [] rmnet_get_stats64+0x64/0x13c [] dev_get_stats+0x68/0xd8 [] rtnl_fill_stats+0x54/0x140 [] rtnl_fill_ifinfo+0x428/0x9cc [] rtmsg_ifinfo_build_skb+0x80/0xf4 [] rtnetlink_event+0x88/0xb4 [] raw_notifier_call_chain+0x58/0x78 [] call_netdevice_notifiers_info+0x48/0x78 [] __netdev_upper_dev_link+0x290/0x5e8 [] netdev_master_upper_dev_link+0x3c/0x48 [] rmnet_newlink+0xf0/0x1c8 [] rtnl_newlink+0x57c/0x6c8 [] rtnetlink_rcv_msg+0xb0/0x244 [] netlink_rcv_skb+0xb4/0xdc [] rtnetlink_rcv+0x34/0x44 [] netlink_unicast+0x1ec/0x294 [] netlink_sendmsg+0x320/0x390 [] sock_sendmsg+0x54/0x60 [] SyS_sendto+0x1a0/0x1e4 [] el0_svc_naked+0x24/0x28 Fixes: 192c4b5d48f2 ("net: qualcomm: rmnet: Add support for 64 bit stats") Signed-off-by: Subash Abhinov Kasiviswanathan
--- drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c index 570a227..346d310 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c @@ -121,7 +121,7 @@ static void rmnet_get_stats64(struct net_device *dev, memset(&total_stats, 0, sizeof(struct rmnet_vnd_stats)); for_each_possible_cpu(cpu) { - pcpu_ptr = this_cpu_ptr(priv->pcpu_stats); + pcpu_ptr = per_cpu_ptr(priv->pcpu_stats, cpu); do { start = u64_stats_fetch_begin_irq(&pcpu_ptr->syncp); -- 1.9.1
[PATCH net 0/3] net: qualcomm: rmnet: Fix issues with CONFIG_DEBUG_PREEMPT enabled
Patches 1 and 2 fix issues identified when CONFIG_DEBUG_PREEMPT was enabled. These involve APIs which were called in invalid contexts. Patch 3 is a null dereference fix identified by code inspection. Subash Abhinov Kasiviswanathan (3): net: qualcomm: rmnet: Fix crash on real dev unregistration net: qualcomm: rmnet: Fix warning seen with 64 bit stats net: qualcomm: rmnet: Fix possible null dereference in command processing drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 68 +- .../ethernet/qualcomm/rmnet/rmnet_map_command.c| 5 ++ drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c| 2 +- 3 files changed, 20 insertions(+), 55 deletions(-) -- 1.9.1
Re: TCP and BBR: reproducibly low cwnd and bandwidth
Hi. On pátek 16. února 2018 21:54:05 CET Eric Dumazet wrote: > /* snip */ > Something fishy really : > /* snip */ > Not only the receiver suddenly adds a 25 ms delay, but also note that > it acknowledges all prior segments (ack 112949), but with a wrong ecr > value ( 2327043753 ) > instead of 2327043759 > /* snip */ Eric has encouraged me to look closer at what's there in the ethtool, and I've just had a free time to play with it. I've found out that enabling scatter- gather (ethtool -K enp3s0 sg on, it is disabled by default on both hosts) brings the throughput back to normal even with BBR+fq_codel. Why? What's the deal BBR has with sg? Oleksandr
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On Fri, 2018-02-16 at 12:54 -0800, Eric Dumazet wrote: > On Fri, Feb 16, 2018 at 9:25 AM, Oleksandr Natalenko >wrote: > > Hi. > > > > On pátek 16. února 2018 17:33:48 CET Neal Cardwell wrote: > > > Thanks for the detailed report! Yes, this sounds like an issue in BBR. We > > > have not run into this one in our team, but we will try to work with you > > > to > > > fix this. > > > > > > Would you be able to take a sender-side tcpdump trace of the slow BBR > > > transfer ("v4.13 + BBR + fq_codel == Not OK")? Packet headers only would > > > be > > > fine. Maybe something like: > > > > > > tcpdump -w /tmp/test.pcap -c100 -s 100 -i eth0 port $PORT > > > > So, going on with two real HW hosts. They are both running latest stock Arch > > Linux kernel (4.15.3-1-ARCH, CONFIG_PREEMPT=y, CONFIG_HZ=1000) and are > > interconnected with 1 Gbps link (via switch if that matters). Using iperf3, > > running each test for 20 seconds. > > > > Having BBR+fq_codel (or pfifo_fast, same result) on both hosts: > > > > Client to server: 112 Mbits/sec > > Server to client: 96.1 Mbits/sec > > > > Having BBR+fq on both hosts: > > > > Client to server: 347 Mbits/sec > > Server to client: 397 Mbits/sec > > > > Having YeAH+fq on both hosts: > > [1] https://natalenko.name/myfiles/bbr/ > > > > Something fishy really : > > 09:18:31.449903 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.], > seq 76745:79641, ack 38, win 227, options [nop,nop,TS val 2327043753 > ecr 3190508870], length 2896 > 09:18:31.449916 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.], > ack 79641, win 1011, options [nop,nop,TS val 3190508870 ecr > 2327043753], length 0 > 09:18:31.449925 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], > seq 79641:83985, ack 38, win 227, options [nop,nop,TS val 2327043753 > ecr 3190508870], length 4344 > 09:18:31.449936 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.], > ack 83985, win 987, options [nop,nop,TS val 3190508870 ecr > 2327043753], length 0 > 09:18:31.450112 IP 
172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], > seq 83985:86881, ack 38, win 227, options [nop,nop,TS val 2327043753 > ecr 3190508870], length 2896 > 09:18:31.450124 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.], > ack 86881, win 971, options [nop,nop,TS val 3190508871 ecr > 2327043753], length 0 > 09:18:31.450299 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], > seq 86881:91225, ack 38, win 227, options [nop,nop,TS val 2327043753 > ecr 3190508870], length 4344 > 09:18:31.450313 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.], > ack 91225, win 947, options [nop,nop,TS val 3190508871 ecr > 2327043753], length 0 > 09:18:31.450491 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.], > seq 91225:92673, ack 38, win 227, options [nop,nop,TS val 2327043753 > ecr 3190508870], length 1448 > 09:18:31.450505 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], > seq 92673:94121, ack 38, win 227, options [nop,nop,TS val 2327043753 > ecr 3190508871], length 1448 > 09:18:31.450511 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.], > seq 94121:95569, ack 38, win 227, options [nop,nop,TS val 2327043754 > ecr 3190508871], length 1448 > 09:18:31.450720 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], > seq 95569:101361, ack 38, win 227, options [nop,nop,TS val 2327043754 > ecr 3190508871], length 5792 > 09:18:31.450932 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], > seq 101361:105705, ack 38, win 227, options [nop,nop,TS val 2327043754 > ecr 3190508871], length 4344 > 09:18:31.451132 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], > seq 105705:110049, ack 38, win 227, options [nop,nop,TS val 2327043754 > ecr 3190508871], length 4344 > 09:18:31.451342 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], > seq 110049:111497, ack 38, win 227, options [nop,nop,TS val 2327043754 > ecr 3190508871], length 1448 > 09:18:31.455841 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], > seq 111497:112945, ack 38, win 227, options [nop,nop,TS val 
2327043759 > ecr 3190508871], length 1448 > > Not only the receiver suddenly adds a 25 ms delay, but also note that > it acknowledges all prior segments (ack 112949), but with a wrong ecr > value ( 2327043753 ) > instead of 2327043759 If you use tcptrace -R test_s2c.pcap xplot.org d2c_rtt.xpl Then you'll see plenty of suspect 40ms rtt samples. It looks like receiver misses wakeups for some reason, and only the TCP delayed ACK timer is helping. So it does not look like a sender side issue to me.
Re: [PATCH] tun: fix mismatch in mutex lock-unlock in tun_get_user()
On Fri, Feb 16, 2018 at 2:11 PM, Alexey Khoroshilov wrote: > There is a single error path where tfile->napi_mutex is left unlocked. > It can lead to a deadlock. > > Found by Linux Driver Verification project (linuxtesting.org). > > Signed-off-by: Alexey Khoroshilov > --- > drivers/net/tun.c | 4 > 1 file changed, 4 insertions(+) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 81e6cc951e7f..0072a9832532 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -1879,6 +1879,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, > struct tun_file *tfile, > default: > this_cpu_inc(tun->pcpu_stats->rx_dropped); > kfree_skb(skb); > + if (frags) { > + tfile->napi.skb = NULL; > + mutex_unlock(&tfile->napi_mutex); > + } > return -EINVAL; I do not believe this can happen for IFF_TUN IFF_NAPI_FRAGS can only be set for IFF_TAP
Re: [PATCH RFC 0/4] net: add bpfilter
From: Florian WestphalDate: Fri, 16 Feb 2018 17:14:08 +0100 > Any particular reason why translating iptables rather than nftables > (it should be possible to monitor the nftables changes that are > announced by kernel and act on those)? As Daniel said, iptables is by far the most deployed of the two technologies. Therefore it provides the largest environment for testing and coverage.
Re: [PATCH RFC 0/4] net: add bpfilter
From: Florian WestphalDate: Fri, 16 Feb 2018 15:57:27 +0100 > 4. Do you plan to reimplement connection tracking in userspace? > If no, how will the bpf program interact with it? The natural way to handle this, as with anything BPF related, is with appropriate BPF helpers which would be added for this purpose.
[PATCH] tun: fix mismatch in mutex lock-unlock in tun_get_user()
There is a single error path where tfile->napi_mutex is left unlocked. It can lead to a deadlock. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov
--- drivers/net/tun.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 81e6cc951e7f..0072a9832532 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1879,6 +1879,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, default: this_cpu_inc(tun->pcpu_stats->rx_dropped); kfree_skb(skb); + if (frags) { + tfile->napi.skb = NULL; + mutex_unlock(&tfile->napi_mutex); + } return -EINVAL; } } -- 2.7.4
Re: [PATCH net-next] net: Only honor ifindex in IP_PKTINFO if non-0
On 2/16/18 2:43 PM, David Miller wrote: > From: David Ahern> Date: Fri, 16 Feb 2018 11:03:03 -0800 > >> Only allow ifindex from IP_PKTINFO to override SO_BINDTODEVICE settings >> if the index is actually set in the message. >> >> Signed-off-by: David Ahern > > Ok, this behavior meets reasonable expectations, applied, thanks. > > None of the documentation is clear about this relationship between > ip_pktinfo's ifindex and settings made by SO_BINDTODEVICE. > It is my understanding that SO_BINDTODEVICE is the strongest -- it requires admin to set. From there IP_PKTINFO and IP_UNICAST_IF are non-root options and hence weaker. If that is the proper expectation, then the right thing to do is probably to error out if ipc.oif is already set. I was concerned that would break existing apps, so this seemed to be a compromise.
Re: [PATCH net-next] tun: export flags, uid, gid, queue information over netlink
From: Sabrina DubrocaDate: Fri, 16 Feb 2018 11:03:07 +0100 > Signed-off-by: Sabrina Dubroca > Reviewed-by: Stefano Brivio Looks good, applied, thanks Sabrina.
Re: [PATCH V6 2/4] sctp: Add ip option support
On Fri, Feb 16, 2018 at 03:14:35PM -0500, Neil Horman wrote: > On Fri, Feb 16, 2018 at 10:56:07AM -0200, Marcelo Ricardo Leitner wrote: > > On Thu, Feb 15, 2018 at 09:15:40AM -0500, Neil Horman wrote: > > > On Tue, Feb 13, 2018 at 08:54:44PM +, Richard Haines wrote: > > > > Add ip option support to allow LSM security modules to utilise > > > > CIPSO/IPv4 > > > > and CALIPSO/IPv6 services. > > > > > > > > Signed-off-by: Richard Haines> > > > --- > > > > include/net/sctp/sctp.h| 4 +++- > > > > include/net/sctp/structs.h | 2 ++ > > > > net/sctp/chunk.c | 12 +++- > > > > net/sctp/ipv6.c| 42 > > > > +++--- > > > > net/sctp/output.c | 5 - > > > > net/sctp/protocol.c| 36 > > > > net/sctp/socket.c | 14 ++ > > > > 7 files changed, 97 insertions(+), 18 deletions(-) > > > > > > > > diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h > > > > index f7ae6b0..25c5c87 100644 > > > > --- a/include/net/sctp/sctp.h > > > > +++ b/include/net/sctp/sctp.h > > > > @@ -441,9 +441,11 @@ static inline int sctp_list_single_entry(struct > > > > list_head *head) > > > > static inline int sctp_frag_point(const struct sctp_association *asoc, > > > > int pmtu) > > > > { > > > > struct sctp_sock *sp = sctp_sk(asoc->base.sk); > > > > + struct sctp_af *af = sp->pf->af; > > > > int frag = pmtu; > > > > > > > > - frag -= sp->pf->af->net_header_len; > > > > + frag -= af->ip_options_len(asoc->base.sk); > > > > + frag -= af->net_header_len; > > > > frag -= sizeof(struct sctphdr) + > > > > sctp_datachk_len(>stream); > > > > > > > > if (asoc->user_frag) > > > > diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h > > > > index 03e92dd..ead5fce 100644 > > > > --- a/include/net/sctp/structs.h > > > > +++ b/include/net/sctp/structs.h > > > > @@ -491,6 +491,7 @@ struct sctp_af { > > > > void(*ecn_capable)(struct sock *sk); > > > > __u16 net_header_len; > > > > int sockaddr_len; > > > > + int (*ip_options_len)(struct sock *sk); > > > > sa_family_t sa_family; > > > > struct list_head 
list; > > > > }; > > > > @@ -515,6 +516,7 @@ struct sctp_pf { > > > > int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr > > > > *addr); > > > > void (*to_sk_saddr)(union sctp_addr *, struct sock *sk); > > > > void (*to_sk_daddr)(union sctp_addr *, struct sock *sk); > > > > + void (*copy_ip_options)(struct sock *sk, struct sock *newsk); > > > > struct sctp_af *af; > > > > }; > > > > > > > > diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c > > > > index 991a530..d5c0ef7 100644 > > > > --- a/net/sctp/chunk.c > > > > +++ b/net/sctp/chunk.c > > > > @@ -154,7 +154,6 @@ static void sctp_datamsg_assign(struct sctp_datamsg > > > > *msg, struct sctp_chunk *chu > > > > chunk->msg = msg; > > > > } > > > > > > > > - > > > > /* A data chunk can have a maximum payload of (2^16 - 20). Break > > > > * down any such message into smaller chunks. Opportunistically, > > > > fragment > > > > * the chunks down to the current MTU constraints. We may get > > > > refragmented > > > > @@ -171,6 +170,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct > > > > sctp_association *asoc, > > > > struct list_head *pos, *temp; > > > > struct sctp_chunk *chunk; > > > > struct sctp_datamsg *msg; > > > > + struct sctp_sock *sp; > > > > + struct sctp_af *af; > > > > int err; > > > > > > > > msg = sctp_datamsg_new(GFP_KERNEL); > > > > @@ -189,9 +190,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct > > > > sctp_association *asoc, > > > > /* This is the biggest possible DATA chunk that can fit into > > > > * the packet > > > > */ > > > > - max_data = asoc->pathmtu - > > > > - sctp_sk(asoc->base.sk)->pf->af->net_header_len - > > > > - sizeof(struct sctphdr) - > > > > sctp_datachk_len(>stream); > > > > + sp = sctp_sk(asoc->base.sk); > > > > + af = sp->pf->af; > > > > + max_data = asoc->pathmtu - af->net_header_len - > > > > + sizeof(struct sctphdr) - > > > > sctp_datachk_len(>stream) - > > > > + af->ip_options_len(asoc->base.sk); > > > > max_data = SCTP_TRUNC4(max_data); > > > > > > > > 
/* If the the peer requested that we authenticate DATA chunks > > > > @@ -211,7 +214,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct > > > > sctp_association *asoc, > > > > > > > > /* Set first_len and then account for possible bundles on first > > > > frag */ > > > > first_len = max_data; > > > > - > > > > /* Check to see if we have a
Re: [PATCH net-next] net: Only honor ifindex in IP_PKTINFO if non-0
From: David Ahern
Date: Fri, 16 Feb 2018 11:03:03 -0800 > Only allow ifindex from IP_PKTINFO to override SO_BINDTODEVICE settings > if the index is actually set in the message. > > Signed-off-by: David Ahern Ok, this behavior meets reasonable expectations, applied, thanks. None of the documentation is clear about this relationship between ip_pktinfo's ifindex and settings made by SO_BINDTODEVICE.
Re: [PATCH][V2] net: dsa: mv88e6xxx: avoid unintended sign extension on a 16 bit shift
From: Colin King
Date: Fri, 16 Feb 2018 16:55:05 + > From: Colin Ian King > > The shifting of timehi by 16 bits to the left will be promoted to > a 32 bit signed int and then sign-extended to an u64. If the top bit > of timehi is set then all the upper bits of ns end up as also > being set because of the sign-extension. Fix this by making timehi and > timelo u64. Also move the declaration of ns. > > Detected by CoverityScan, CID#1465288 ("Unintended sign extension") > > Fixes: c6fe0ad2c349 ("net: dsa: mv88e6xxx: add rx/tx timestamping support") > Signed-off-by: Colin Ian King Please indicate the appropriate target tree in your Subject lines in the future, for this it should be "[PATCH net-next]". Applied, thanks.
Re: [PATCH v2] ravb: add support for changing MTU
From: Niklas SöderlundDate: Fri, 16 Feb 2018 17:10:08 +0100 > Allow for changing the MTU within the limit of the maximum size of a > descriptor (2048 bytes). Add the callback to change MTU from user-space > and take the configurable MTU into account when configuring the > hardware. > > Signed-off-by: Niklas Söderlund Applied. However, most drivers support MTU change while the interface is up and frankly that is what is most useful for users.
Re: [RFC PATCH] ptr_ring: linked list fallback
From: "Michael S. Tsirkin"
Date: Fri, 16 Feb 2018 09:40:54 +0200 > So pointer rings work fine, but they have a problem: > make them too small and not enough entries fit. > Make them too large and you start flushing your cache > and running out of memory. > > This is a new idea of mine: a ring backed by a > linked list. Once you run out of ring entries, > instead of a drop you fall back on a list with > a common lock. > > Should work well for the case where the ring is typically sized > correctly, but will help address the fact that some users try to set e.g. > tx queue length to 100. > > My hope this will move us closer to direction where e.g. fw codel can > use ptr rings without locking at all. > The API is still very rough, and I really need to take a hard look > at lock nesting. > > Completely untested, sending for early feedback/flames. > > Signed-off-by: Michael S. Tsirkin So the idea is that if a user sets a really huge TX queue length, we allocate a ptr_ring which is smaller, and use the backup linked list when necessary to provide the requested TX queue length legitimately. Right?
Re: [PATCH net] sctp: remove the left unnecessary check for chunk in sctp_renege_events
From: Xin LongDate: Fri, 16 Feb 2018 17:18:33 +0800 > Commit fb23403536ea ("sctp: remove the useless check in > sctp_renege_events") forgot to remove another check for > chunk in sctp_renege_events. > > Dan found this when doing a static check. > > This patch is to remove that check, and also to merge > two checks into one 'if statement'. > > Fixes: fb23403536ea ("sctp: remove the useless check in sctp_renege_events") > Reported-by: Dan Carpenter > Signed-off-by: Xin Long Applied, thank you.
Re: [PATCH net] tg3: APE heartbeat changes
From: Satish Baddipadige
Date: Fri, 16 Feb 2018 10:01:29 +0530 > @@ -990,6 +984,18 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, > int kind) > tg3_ape_send_event(tp, event); > } > > +static inline void tg3_send_ape_heartbeat(struct tg3 *tp, Inline functions are not appropriate in foo.c files, please drop the inline keyword and let the compiler decide.
Re: [PATCH net-next 0/2] nfp: whitespace sync and flower TCP flags
From: Jakub KicinskiDate: Thu, 15 Feb 2018 20:19:07 -0800 > Whitespace cleanup from Michael and flower offload support for matching > on TCP flags from Pieter. Series applied, thanks Jakub.
Re: [PATCH net] rxrpc: Work around usercopy check
From: David HowellsDate: Thu, 15 Feb 2018 22:59:00 + > Due to a check recently added to copy_to_user(), it's now not permitted to > copy from slab-held data to userspace unless the slab is whitelisted. This > affects rxrpc_recvmsg() when it attempts to place an RXRPC_USER_CALL_ID > control message in the userspace control message buffer. A warning is > generated by usercopy_warn() because the source is the copy of the > user_call_ID retained in the rxrpc_call struct. > > Work around the issue by copying the user_call_ID to a variable on the > stack and passing that to put_cmsg(). > > The warning generated looks like: ... > Reported-by: Jonathan Billings > Signed-off-by: David Howells > Acked-by: Kees Cook > Tested-by: Jonathan Billings Applied, thanks David.
Re: [PATCH net] tun: fix tun_napi_alloc_frags() frag allocator
From: Eric DumazetDate: Thu, 15 Feb 2018 14:47:15 -0800 > From: Eric Dumazet > > > While fuzzing arm64 v4.16-rc1 with Syzkaller, I've been hitting a > misaligned atomic in __skb_clone: > > atomic_inc(&(skb_shinfo(skb)->dataref)); > >where dataref doesn't have the required natural alignment, and the >atomic operation faults. e.g. i often see it aligned to a single >byte boundary rather than a four byte boundary. > >AFAICT, the skb_shared_info is misaligned at the instant it's >allocated in __napi_alloc_skb() __napi_alloc_skb() > > > Problem is caused by tun_napi_alloc_frags() using > napi_alloc_frag() with user provided seg sizes, > leading to other users of this API getting unaligned > page fragments. > > Since we would like to not necessarily add paddings or alignments to > the frags that tun_napi_alloc_frags() attaches to the skb, switch to > another page frag allocator. > > As a bonus skb_page_frag_refill() can use GFP_KERNEL allocations, > meaning that we can not deplete memory reserves as easily. > > Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver") > Signed-off-by: Eric Dumazet > Reported-by: Mark Rutland > Tested-by: Mark Rutland Applied and queued up for -stable, thanks Eric.
Re: [PATCH v3 net-next 0/7] RDS: zerocopy support
From: Sowmini VaradhanDate: Thu, 15 Feb 2018 10:49:31 -0800 > This is version 3 of the series, following up on review comments for > http://patchwork.ozlabs.org/project/netdev/list/?series=28530 > > Review comments addressed > Patch 4 > - fix fragile use of skb->cb[], do not set ee_code incorrectly. > Patch 5: > - remove needless bzero of skb->cb[], consolidate err cleanup > > A brief overview of this feature follows. > > This patch series provides support for MSG_ZERCOCOPY > on a PF_RDS socket based on the APIs and infrastructure added > by Commit f214f915e7db ("tcp: enable MSG_ZEROCOPY") > > For single threaded rds-stress testing using rds-tcp with the > ixgbe driver using 1M message sizes (-a 1M -q 1M) preliminary > results show that there is a significant reduction in latency: about > 90 usec with zerocopy, compared with 200 usec without zerocopy. > > This patchset modifies the above for zerocopy in the following manner. > - if the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and, > - if the SO_ZEROCOPY socket option has been set on the PF_RDS socket, > application pages sent down with rds_sendmsg are pinned. The pinning > uses the accounting infrastructure added by a91dbff551a6 ("sock: ulimit > on MSG_ZEROCOPY pages"). The message is unpinned when all references > to the message go down to 0, and the message is freed by rds_message_purge. > > A multithreaded application using this infrastructure must send down > a unique 32 bit cookie as ancillary data with each sendmsg invocation. > The format of this ancillary data is described in Patch 5 of the series. > The cookie is passed up to the application on the sk_error_queue when > the message is unpinned, indicating to the application that it is now > safe to free/reuse the message buffer. The details of the completion > notification are provided in Patch 4 of this series. Series applied, thanks a lot!
Re: [PATCH v1 1/1] drivers: isdn: NULL pointer dereference [null-pointer-deref] (CWE 476) problem
From: Joe Moriarty
Date: Thu, 15 Feb 2018 15:27:00 -0500 > The Parfait (version 2.1.0) static code analysis tool found the > following NULL pointer dereference problem. > > - drivers/isdn/mISDN/core.c > function channelmap_show() does not check the returned mdev > variable from dev_to_mISDN() for NULL. Added the check for > NULL, which results in a value of 0 being returned. > > Signed-off-by: Joe Moriarty > Reviewed-by: Jonathan Helman Since the lifetime for the sysfs files for these devices is strictly smaller than the lifetime of the 'dev' object and its driver data, it is not possible for mdev to be NULL in this situation. I understand what the static checker is trying to do, but within context this lack of a NULL check is legitimate. I'm not applying this, sorry.
Re: [PATCH net] udplite: fix partial checksum initialization
From: Alexey KodanevDate: Thu, 15 Feb 2018 20:18:43 +0300 > Since UDP-Lite is always using checksum, the following path is > triggered when calculating pseudo header for it: > > udp4_csum_init() or udp6_csum_init() > skb_checksum_init_zero_check() > __skb_checksum_validate_complete() > > The problem can appear if skb->len is less than CHECKSUM_BREAK. In > this particular case __skb_checksum_validate_complete() also invokes > __skb_checksum_complete(skb). If UDP-Lite is using partial checksum > that covers only part of a packet, the function will return bad > checksum and the packet will be dropped. > > It can be fixed if we skip skb_checksum_init_zero_check() and only > set the required pseudo header checksum for UDP-Lite with partial > checksum before udp4_csum_init()/udp6_csum_init() functions return. > > Fixes: ed70fcfcee95 ("net: Call skb_checksum_init in IPv4") > Fixes: e4f45b7f40bd ("net: Call skb_checksum_init in IPv6") > Signed-off-by: Alexey Kodanev Applied and queued up for -stable, thanks Alexey.
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On Fri, Feb 16, 2018 at 9:25 AM, Oleksandr Natalenkowrote: > Hi. > > On pátek 16. února 2018 17:33:48 CET Neal Cardwell wrote: >> Thanks for the detailed report! Yes, this sounds like an issue in BBR. We >> have not run into this one in our team, but we will try to work with you to >> fix this. >> >> Would you be able to take a sender-side tcpdump trace of the slow BBR >> transfer ("v4.13 + BBR + fq_codel == Not OK")? Packet headers only would be >> fine. Maybe something like: >> >> tcpdump -w /tmp/test.pcap -c100 -s 100 -i eth0 port $PORT > > So, going on with two real HW hosts. They are both running latest stock Arch > Linux kernel (4.15.3-1-ARCH, CONFIG_PREEMPT=y, CONFIG_HZ=1000) and are > interconnected with 1 Gbps link (via switch if that matters). Using iperf3, > running each test for 20 seconds. > > Having BBR+fq_codel (or pfifo_fast, same result) on both hosts: > > Client to server: 112 Mbits/sec > Server to client: 96.1 Mbits/sec > > Having BBR+fq on both hosts: > > Client to server: 347 Mbits/sec > Server to client: 397 Mbits/sec > > Having YeAH+fq on both hosts: > [1] https://natalenko.name/myfiles/bbr/ > Something fishy really : 09:18:31.449903 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.], seq 76745:79641, ack 38, win 227, options [nop,nop,TS val 2327043753 ecr 3190508870], length 2896 09:18:31.449916 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.], ack 79641, win 1011, options [nop,nop,TS val 3190508870 ecr 2327043753], length 0 09:18:31.449925 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 79641:83985, ack 38, win 227, options [nop,nop,TS val 2327043753 ecr 3190508870], length 4344 09:18:31.449936 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.], ack 83985, win 987, options [nop,nop,TS val 3190508870 ecr 2327043753], length 0 09:18:31.450112 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 83985:86881, ack 38, win 227, options [nop,nop,TS val 2327043753 ecr 3190508870], length 2896 09:18:31.450124 IP 172.29.28.55.14936 
> 172.29.28.1.5201: Flags [.], ack 86881, win 971, options [nop,nop,TS val 3190508871 ecr 2327043753], length 0 09:18:31.450299 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 86881:91225, ack 38, win 227, options [nop,nop,TS val 2327043753 ecr 3190508870], length 4344 09:18:31.450313 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.], ack 91225, win 947, options [nop,nop,TS val 3190508871 ecr 2327043753], length 0 09:18:31.450491 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.], seq 91225:92673, ack 38, win 227, options [nop,nop,TS val 2327043753 ecr 3190508870], length 1448 09:18:31.450505 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 92673:94121, ack 38, win 227, options [nop,nop,TS val 2327043753 ecr 3190508871], length 1448 09:18:31.450511 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [P.], seq 94121:95569, ack 38, win 227, options [nop,nop,TS val 2327043754 ecr 3190508871], length 1448 09:18:31.450720 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 95569:101361, ack 38, win 227, options [nop,nop,TS val 2327043754 ecr 3190508871], length 5792 09:18:31.450932 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 101361:105705, ack 38, win 227, options [nop,nop,TS val 2327043754 ecr 3190508871], length 4344 09:18:31.451132 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 105705:110049, ack 38, win 227, options [nop,nop,TS val 2327043754 ecr 3190508871], length 4344 09:18:31.451342 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 110049:111497, ack 38, win 227, options [nop,nop,TS val 2327043754 ecr 3190508871], length 1448 09:18:31.455841 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 111497:112945, ack 38, win 227, options [nop,nop,TS val 2327043759 ecr 3190508871], length 1448 Not only the receiver suddenly adds a 25 ms delay, but also note that it acknowledges all prior segments (ack 112949), but with a wrong ecr value ( 2327043753 ) instead of 2327043759 09:18:31.482238 IP 172.29.28.55.14936 > 
172.29.28.1.5201: Flags [.], ack 112945, win , options [nop,nop,TS val 3190508903 ecr 2327043753], length 0 09:18:31.482704 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 112945:114393, ack 38, win 227, options [nop,nop,TS val 2327043786 ecr 3190508903], length 1448 09:18:31.482734 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.], ack 114393, win 1134, options [nop,nop,TS val 3190508903 ecr 2327043786], length 0 09:18:31.482802 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 114393:117289, ack 38, win 227, options [nop,nop,TS val 2327043786 ecr 3190508903], length 2896 09:18:31.482813 IP 172.29.28.55.14936 > 172.29.28.1.5201: Flags [.], ack 117289, win 1179, options [nop,nop,TS val 3190508903 ecr 2327043786], length 0 09:18:31.483138 IP 172.29.28.1.5201 > 172.29.28.55.14936: Flags [.], seq 117289:120185, ack 38, win 227, options [nop,nop,TS val 2327043786 ecr 3190508903], length 2896 09:18:31.483158 IP 172.29.28.55.14936
[PATCH] skbuff: Fix comment mis-spelling.
'peform' --> 'perform' Signed-off-by: David S. Miller--- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ebc0f869720..c1e66bdcf583 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3646,7 +3646,7 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, return true; } -/* For small packets <= CHECKSUM_BREAK peform checksum complete directly +/* For small packets <= CHECKSUM_BREAK perform checksum complete directly * in checksum_init. */ #define CHECKSUM_BREAK 76 -- 2.14.3
Re: [PATCH net] decnet: move nf_{get/set}sockopt outside sock lock
From: Paolo AbeniDate: Thu, 15 Feb 2018 16:59:49 +0100 > After commit 3f34cfae1238 ("netfilter: on sockopt() acquire sock lock > only in the required scope"), the caller of nf_{get/set}sockopt() must > not hold any lock, but, in such changeset, I forgot to cope with DECnet. > > This commit addresses the issue moving the nf call outside the lock, > in the dn_{get,set}sockopt() with the same schema currently used by > ipv4 and ipv6. Also moves the unhandled sockopts of the end of the main > switch statements, to improve code readability. > > Reported-by: Petr Vandrovec > BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=198791#c2 > Fixes: 3f34cfae1238 ("netfilter: on sockopt() acquire sock lock only in the > required scope") > Signed-off-by: Paolo Abeni Applied, thank you.
Re: [PATCHv3 net-next 0/8] net: sched: act: add extack support
From: Alexander AringDate: Thu, 15 Feb 2018 10:54:52 -0500 > this patch series adds extack support for the TC action subsystem. > As example I for the extack support in a TC action I choosed mirred > action. > > - Alex > > Cc: David Ahern > > changes since v3: > - adapt recommended changes from Davide Caratti, please check if > I catch everything. Thanks. > > changes since v2: > > - remove newline in extack of generic walker handling > Thanks to Davide Caratti > - add ker...@mojatatu.com in cc Series applied, thanks.
Re: [PATCH RFC 0/4] net: add bpfilter
Hi Florian, On 02/16/2018 05:14 PM, Florian Westphal wrote: > Florian Westphalwrote: >> Daniel Borkmann wrote: >> Several questions spinning at the moment, I will probably come up with >> more: > > ... and here there are some more ... > > One of the many pain points of xtables design is the assumption of 'used > only by sysadmin'. > > This has not been true for a very long time, so by now iptables has > this userspace lock (yes, its fugly workaround) to serialize concurrent > iptables invocations in userspace. > > AFAIU the translate-in-userspace design now brings back the old problem > of different tools overwriting each others iptables rules. Right, so the behavior would need to be adapted to be exactly the same, given all the requests go into kernel space first via the usual uapis, I don't think there would be anything in the way of keeping that as is. > Another question -- am i correct in that each rule manipulation would > incur a 'recompilation'? Or are there different mini programs chained > together? Right now in the PoC yes, basically it regenerates the program on the fly in gen.c when walking the struct bpfilter_ipt_ip's and appends the entries to the program, but it doesn't have to be that way. There are multiple options to allow for a partial code generation, e.g. via chaining tail call arrays or directly via BPF to BPF calls eventually, there would be few changes on BPF side needed, but it can be done; there could additionally be various optimizations passes during code generation phase performed while keeping given constraints in order to speed up getting to a verdict. > One of the nftables advantages is that (since rule representation in > kernel is black-box from userspace point of view) is that the kernel > can announce add/delete of rules or elements from nftables sets. > > Any particular reason why translating iptables rather than nftables > (it should be possible to monitor the nftables changes that are > announced by kernel and act on those)? 
Yeah, correct, this should be possible as well. We started out with the iptables part in the demo as the majority of bigger infrastructure projects all still rely heavily on it (e.g. docker, k8s to just name two big ones). Usually they have their requests to iptables baked into their code directly which probably won't change any time soon, so thought was that they could benefit initially from it once there would be sufficient coverage. Thanks, Daniel
Re: [PATCH net] net: sched: fix unbalance in the error path of tca_action_flush()
From: Davide Caratti
Date: Thu, 15 Feb 2018 15:50:57 +0100 > When tca_action_flush() calls the action walk() and gets an error, > a successful call to nla_nest_start() is not followed by a call to > nla_nest_cancel(). It's harmless, as the skb is freed in the error > path - but it's worth to fix this unbalance. > > Signed-off-by: Davide Caratti Applied to net-next, thanks.
Re: [PATCH] PCI/cxgb4: Extend T3 PCI quirk to T4+ devices
From: Ganesh GoudarDate: Thu, 15 Feb 2018 20:03:18 +0530 > From: Casey Leedom > > We've run into a problem where our device is attached > to a Virtual Machine and the use of the new pci_set_vpd_size() > API doesn't help. The VM kernel has been informed that > the accesses are okay, but all of the actual VPD Capability > Accesses are trapped down into the KVM Hypervisor where it > goes ahead and imposes the silent denials. > > The right idea is to follow the kernel.org > commit 1c7de2b4ff88 ("PCI: Enable access to non-standard VPD for > Chelsio devices (cxgb3)") which Alexey Kardashevskiy authored > to establish a PCI Quirk for our T3-based adapters. This commit > extends that PCI Quirk to cover Chelsio T4 devices and later. > > The advantage of this approach is that the VPD Size gets set early > in the Base OS/Hypervisor Boot and doesn't require that the cxgb4 > driver even be available in the Base OS/Hypervisor. Thus PF4 can > be exported to a Virtual Machine and everything should work. > > Fixes: 67e658794ca1 ("cxgb4: Set VPD size so we can read both VPD structures") > Cc: # v4.9+ > Signed-off-by: Casey Leedom > Signed-off-by: Arjun Vynipadath > Signed-off-by: Ganesh Goudar Applied, thanks.
Re: [PATCH 0/3] Remove IPVlan module dependencies on IPv6 and Netfilter
From: Matteo Croce
Date: Thu, 15 Feb 2018 15:04:52 +0100 > What about the other two, removing IPv6 and change the Kconfig? > Other devices like VXLan, Geneve and VRF uses the same architecture > to allow conditional compilation of the IPv6 module, I think that > IPVlan should do the same. Ok, those parts should be reasonable, please respin.
Re: [PATCH net-next] cxgb4: append firmware dump to vmcore in kernel panic
From: Rahul Lakkireddy
Date: Thu, 15 Feb 2018 19:24:42 +0530 > Register callback to panic_notifier_list. Invoke dump collect routine > to append dump to vmcore. > > Signed-off-by: Rahul Lakkireddy > Signed-off-by: Ganesh Goudar There is absolutely no precedent for a networking driver dumping things into the vmcore image on a panic. And I don't think this is a good idea. Really, this commit message should have explained why this is desired and in what context it is legitimate for this driver in particular to do it. A very detailed, long, complete commit message is especially important when you are deciding to blaze your own trail and do something no other networking driver has done before. I get really upset when I see changes like this, because you give me no preparation for what I'm about to read in the patch and therefore I have to go into this routine asking you to explain things properly. But as-is, I see this panic notifier as a really bad idea. Thanks.
Re: [PATCH net-next 0/2] net: dsa: mv88e6xxx: Improve PTP access latency
From: Andrew Lunn
Date: Thu, 15 Feb 2018 14:38:33 +0100 > PTP needs to retrieve the hardware timestamps from the switch device > in a low latency manner. However ethtool -S and bridge fdb show can > hold the switch register access mutex for a long time. These patches > change the reading of the statistics and the ATU so that the mutex is > released and taken again between each statistic or ATU entry. The PTP > code can then interleave its access to the hardware, keeping its > latency low. Series applied, thanks Andrew.
Re: [net-next v2 1/1] tipc: avoid unnecessary copying of bundled messages
From: Jon MaloyDate: Thu, 15 Feb 2018 14:14:37 +0100 > A received sk buffer may contain dozens of smaller 'bundled' messages > which after extraction go each in their own direction. > > Unfortunately, when we extract those messages using skb_clone() each > of the extracted buffers inherit the truesize value of the original > buffer. Apart from causing massive overaccounting of the base buffer's > memory, this often causes tipc_msg_validate() to come to the false > conclusion that the ratio truesize/datasize > 4, and perform an > unnecessary copying of the extracted buffer. > > We now fix this problem by explicitly correcting the truesize value of > the buffer clones to be the truesize of the clone itself plus a > calculated fraction of the base buffer's overhead. This change > eliminates the overaccounting and at least mitigates the occurrence > of unnecessary buffer copying. > > Reported-by: Hoang Le > Acked-by: Ying Xue > Signed-off-by: Jon Maloy As I explained in my previous two emails, I don't think this method of accounting is appropriate. All of your clones must use the same skb->truesize as the original SKB because each and every one of them keeps the full buffer from being liberated until they are released.
Re: [PATCH net] cxgb4: fix trailing zero in CIM LA dump
From: Rahul Lakkireddy
Date: Thu, 15 Feb 2018 18:20:01 +0530 > Set correct size of the CIM LA dump for T6. > > Fixes: 27887bc7cb7f ("cxgb4: collect hardware LA dumps") > Signed-off-by: Rahul Lakkireddy > Signed-off-by: Ganesh Goudar Applied and queued up for -stable, thanks.
Re: [PATCH net] cxgb4: free up resources of pf 0-3
From: Ganesh Goudar
Date: Thu, 15 Feb 2018 18:16:57 +0530 > free pf 0-3 resources, commit baf5086840ab ("cxgb4: > restructure VF mgmt code") erroneously removed the > code which frees the pf 0-3 resources, causing the > probe of pf 0-3 to fail in case of driver reload. > > Fixes: baf5086840ab ("cxgb4: restructure VF mgmt code") > Signed-off-by: Ganesh Goudar Series applied, thank you.
Re: [net-next 00/10] tipc: de-generalize topology server
From: Jon Maloy
Date: Thu, 15 Feb 2018 10:40:41 +0100 > The topology server is partially based on a template that is much > more generic than what we need. This results in a code that is > unnecessarily hard to follow and keeping bug free. > > We now take the consequence of the fact that we only have one such > server in TIPC, - with no prospects for introducing any more, and > adapt the code to the specialized task it really is doing. Nice cleanup, series applied, thanks Jon.
Re: [net-next 1/1] tipc: avoid unnecessary copying of bundled messages
From: Jon Maloy
Date: Thu, 15 Feb 2018 08:57:14 +0000 > The buffers we are cloning are linearized 1 MTU incoming > buffers. There are no fragments. Each clone normally points to only > a tiny fraction of the data area of the base buffer. I don't claim > that copying always is bad, but in this case it happens in the > majority of cases, and as I see it completely unnecessarily. > > There is actually some under accounting, however, since we now only > count the data area of the base buffer (== the sum of the data area > of the clones) plus the overhead of the clones. A more accurate > calculation, taking into account even the overhead of the base > buffer, would look like this: (*iskb)->truesize = SKB_TRUESIZE(imsz) > + (skb->truesize - skb->len) / msg_msgcnt(msg); > > I.e., we calculate the overhead of the base buffer and divide it > equally among the clones. Now I really can't see we are missing > anything. Maybe there is some miscommunication between us. Let me try using different words. If you have a single SKB which has a truesize of, say, 10K and then you clone this several times to point the SKB into small windows into that original SKB's buffer, then you must use the same 10K truesize for the clones that you did for the original SKB. It absolutely does not matter that the clones are only pointing to a small part of the original SKB's buffer. Until they are freed up each individual SKB still causes that _ENTIRE_ buffer to be held up and not freeable. This is why you have to be careful how you manage SKBs, and in fact what you are doing by cloning and constricting the data pointers, is actually troublesome and you can see this with the truesize problems you are running into. SKBs are really not built to be managed like this.
Re: [PATCH net v2] fib_semantics: Don't match route with mismatching tclassid
From: Stefano BrivioDate: Thu, 15 Feb 2018 09:46:03 +0100 > In fib_nh_match(), if output interface or gateway are passed in > the FIB configuration, we don't have to check next hops of > multipath routes to conclude whether we have a match or not. > > However, we might still have routes with different realms > matching the same output interface and gateway configuration, > and this needs to cause the match to fail. Otherwise the first > route inserted in the FIB will match, regardless of the realms: > > # ip route add 1.1.1.1 dev eth0 table 1234 realms 1/2 > # ip route append 1.1.1.1 dev eth0 table 1234 realms 3/4 > # ip route list table 1234 > 1.1.1.1 dev eth0 scope link realms 1/2 > 1.1.1.1 dev eth0 scope link realms 3/4 > # ip route del 1.1.1.1 dev ens3 table 1234 realms 3/4 > # ip route list table 1234 > 1.1.1.1 dev ens3 scope link realms 3/4 > > whereas route with realms 3/4 should have been deleted instead. > > Explicitly check for fc_flow passed in the FIB configuration > (this comes from RTA_FLOW extracted by rtm_to_fib_config()) and > fail matching if it differs from nh_tclassid. > > The handling of RTA_FLOW for multipath routes later in > fib_nh_match() is still needed, as we can have multiple RTA_FLOW > attributes that need to be matched against the tclassid of each > next hop. > > v2: Check that fc_flow is set before discarding the match, so > that the user can still select the first matching rule by > not specifying any realm, as suggested by David Ahern. > > Reported-by: Jianlin Shi > Signed-off-by: Stefano Brivio Applied and queued up for -stable, thanks.
Re: [PATCH V6 2/4] sctp: Add ip option support
On Fri, Feb 16, 2018 at 10:56:07AM -0200, Marcelo Ricardo Leitner wrote: > On Thu, Feb 15, 2018 at 09:15:40AM -0500, Neil Horman wrote: > > On Tue, Feb 13, 2018 at 08:54:44PM +, Richard Haines wrote: > > > Add ip option support to allow LSM security modules to utilise CIPSO/IPv4 > > > and CALIPSO/IPv6 services. > > > > > > Signed-off-by: Richard Haines> > > --- > > > include/net/sctp/sctp.h| 4 +++- > > > include/net/sctp/structs.h | 2 ++ > > > net/sctp/chunk.c | 12 +++- > > > net/sctp/ipv6.c| 42 > > > +++--- > > > net/sctp/output.c | 5 - > > > net/sctp/protocol.c| 36 > > > net/sctp/socket.c | 14 ++ > > > 7 files changed, 97 insertions(+), 18 deletions(-) > > > > > > diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h > > > index f7ae6b0..25c5c87 100644 > > > --- a/include/net/sctp/sctp.h > > > +++ b/include/net/sctp/sctp.h > > > @@ -441,9 +441,11 @@ static inline int sctp_list_single_entry(struct > > > list_head *head) > > > static inline int sctp_frag_point(const struct sctp_association *asoc, > > > int pmtu) > > > { > > > struct sctp_sock *sp = sctp_sk(asoc->base.sk); > > > + struct sctp_af *af = sp->pf->af; > > > int frag = pmtu; > > > > > > - frag -= sp->pf->af->net_header_len; > > > + frag -= af->ip_options_len(asoc->base.sk); > > > + frag -= af->net_header_len; > > > frag -= sizeof(struct sctphdr) + sctp_datachk_len(>stream); > > > > > > if (asoc->user_frag) > > > diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h > > > index 03e92dd..ead5fce 100644 > > > --- a/include/net/sctp/structs.h > > > +++ b/include/net/sctp/structs.h > > > @@ -491,6 +491,7 @@ struct sctp_af { > > > void(*ecn_capable)(struct sock *sk); > > > __u16 net_header_len; > > > int sockaddr_len; > > > + int (*ip_options_len)(struct sock *sk); > > > sa_family_t sa_family; > > > struct list_head list; > > > }; > > > @@ -515,6 +516,7 @@ struct sctp_pf { > > > int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr); > > > void (*to_sk_saddr)(union sctp_addr *, 
struct sock *sk); > > > void (*to_sk_daddr)(union sctp_addr *, struct sock *sk); > > > + void (*copy_ip_options)(struct sock *sk, struct sock *newsk); > > > struct sctp_af *af; > > > }; > > > > > > diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c > > > index 991a530..d5c0ef7 100644 > > > --- a/net/sctp/chunk.c > > > +++ b/net/sctp/chunk.c > > > @@ -154,7 +154,6 @@ static void sctp_datamsg_assign(struct sctp_datamsg > > > *msg, struct sctp_chunk *chu > > > chunk->msg = msg; > > > } > > > > > > - > > > /* A data chunk can have a maximum payload of (2^16 - 20). Break > > > * down any such message into smaller chunks. Opportunistically, > > > fragment > > > * the chunks down to the current MTU constraints. We may get > > > refragmented > > > @@ -171,6 +170,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct > > > sctp_association *asoc, > > > struct list_head *pos, *temp; > > > struct sctp_chunk *chunk; > > > struct sctp_datamsg *msg; > > > + struct sctp_sock *sp; > > > + struct sctp_af *af; > > > int err; > > > > > > msg = sctp_datamsg_new(GFP_KERNEL); > > > @@ -189,9 +190,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct > > > sctp_association *asoc, > > > /* This is the biggest possible DATA chunk that can fit into > > >* the packet > > >*/ > > > - max_data = asoc->pathmtu - > > > -sctp_sk(asoc->base.sk)->pf->af->net_header_len - > > > -sizeof(struct sctphdr) - sctp_datachk_len(>stream); > > > + sp = sctp_sk(asoc->base.sk); > > > + af = sp->pf->af; > > > + max_data = asoc->pathmtu - af->net_header_len - > > > +sizeof(struct sctphdr) - sctp_datachk_len(>stream) - > > > +af->ip_options_len(asoc->base.sk); > > > max_data = SCTP_TRUNC4(max_data); > > > > > > /* If the the peer requested that we authenticate DATA chunks > > > @@ -211,7 +214,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct > > > sctp_association *asoc, > > > > > > /* Set first_len and then account for possible bundles on first frag */ > > > first_len = max_data; > > > - > > > /* Check to 
see if we have a pending SACK and try to let it be bundled > > >* with this message. Do this if we don't have any data queued already. > > >* To check that, look at out_qlen and retransmit list. > > > diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c > > > index e35d4f7..0b0f895 100644 > > > --- a/net/sctp/ipv6.c > > > +++ b/net/sctp/ipv6.c > > > @@ -427,6 +427,38 @@ static void sctp_v6_copy_addrlist(struct list_head > > > *addrlist, > > > rcu_read_unlock(); > > > } > > > > > > +/* Copy over any ip options */ > > > +static void sctp_v6_copy_ip_options(struct sock *sk, struct sock
Re: TCP and BBR: reproducibly low cwnd and bandwidth
Hi. On pátek 16. února 2018 18:56:12 CET Holger Hoffstätte wrote: > There is simply no reason why you shouldn't get approx. line rate > (~920+-ish) Mbit over wired 1GBit Ethernet; even my broken 10-year old > Core2Duo laptop can do that. Can you boot with spectre_v2=off and try "the > simplest case" with the defaults cubic/pfifo_fast? spectre_v2 has terrible > performance impact esp. on small/older processors. Just have tried. No visible difference. > When I last benchmarked full PREEMPT with 4.9.x it was similarly bad and > also had a noticeable network throughput impact even on my i7. > > Also congratulations for being the only other person I know who ever tried > YeAH. :-) Well, according to the git log on tcp_yeah.c and Reported-by tag, I was not the only one there ;). Regards, Oleksandr
Re: [PATCH v2] ravb: add support for changing MTU
On 02/16/2018 11:43 AM, Sergei Shtylyov wrote: > Hello! > > On 02/16/2018 10:42 PM, Florian Fainelli wrote: > >>> Allow for changing the MTU within the limit of the maximum size of a >>> descriptor (2048 bytes). Add the callback to change MTU from user-space >>> and take the configurable MTU into account when configuring the >>> hardware. >>> >>> Signed-off-by: Niklas Söderlund>>> --- >> >>> >>> +static int ravb_change_mtu(struct net_device *ndev, int new_mtu) >>> +{ >>> + if (netif_running(ndev)) >>> + return -EBUSY; >>> + >>> + ndev->mtu = new_mtu; >>> + netdev_update_features(ndev); >> >> Don't you somehow need to quiesce the RX DMA and make sure you that you >> re-allocate all RX buffers within priv->rx_skb[q][entry] such that they >> will be able to accept a larger buffer size? >> >> If we put the ravb interface under high RX load and we change the MTU on >> the fly, can we crash the kernel? > >I thought the netif_running() check guards against that. Does it not? It does, completely missed that, thanks! -- Florian
Re: [PATCH v2] ravb: add support for changing MTU
Hello! On 02/16/2018 10:42 PM, Florian Fainelli wrote: >> Allow for changing the MTU within the limit of the maximum size of a >> descriptor (2048 bytes). Add the callback to change MTU from user-space >> and take the configurable MTU into account when configuring the >> hardware. >> >> Signed-off-by: Niklas Söderlund>> --- > >> >> +static int ravb_change_mtu(struct net_device *ndev, int new_mtu) >> +{ >> +if (netif_running(ndev)) >> +return -EBUSY; >> + >> +ndev->mtu = new_mtu; >> +netdev_update_features(ndev); > > Don't you somehow need to quiesce the RX DMA and make sure you that you > re-allocate all RX buffers within priv->rx_skb[q][entry] such that they > will be able to accept a larger buffer size? > > If we put the ravb interface under high RX load and we change the MTU on > the fly, can we crash the kernel? I thought the netif_running() check guards against that. Does it not? MBR, Sergei
Re: [PATCH v2] ravb: add support for changing MTU
On 02/16/2018 08:10 AM, Niklas Söderlund wrote: > Allow for changing the MTU within the limit of the maximum size of a > descriptor (2048 bytes). Add the callback to change MTU from user-space > and take the configurable MTU into account when configuring the > hardware. > > Signed-off-by: Niklas Söderlund> --- > > +static int ravb_change_mtu(struct net_device *ndev, int new_mtu) > +{ > + if (netif_running(ndev)) > + return -EBUSY; > + > + ndev->mtu = new_mtu; > + netdev_update_features(ndev); Don't you somehow need to quiesce the RX DMA and make sure you that you re-allocate all RX buffers within priv->rx_skb[q][entry] such that they will be able to accept a larger buffer size? If we put the ravb interface under high RX load and we change the MTU on the fly, can we crash the kernel? -- Florian
[PATCH net-next] net: Only honor ifindex in IP_PKTINFO if non-0
Only allow ifindex from IP_PKTINFO to override SO_BINDTODEVICE settings if the index is actually set in the message. Signed-off-by: David Ahern--- net/ipv4/ip_sockglue.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 008be04ac1cc..9dca0fb8c482 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -258,7 +258,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); if (!ipv6_addr_v4mapped(_info->ipi6_addr)) return -EINVAL; - ipc->oif = src_info->ipi6_ifindex; + if (src_info->ipi6_ifindex) + ipc->oif = src_info->ipi6_ifindex; ipc->addr = src_info->ipi6_addr.s6_addr32[3]; continue; } @@ -288,7 +289,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) return -EINVAL; info = (struct in_pktinfo *)CMSG_DATA(cmsg); - ipc->oif = info->ipi_ifindex; + if (info->ipi_ifindex) + ipc->oif = info->ipi_ifindex; ipc->addr = info->ipi_spec_dst.s_addr; break; } -- 2.11.0
Re: ppp/pppoe, still panic 4.15.3 in ppp_push
On Fri, Feb 16, 2018 at 01:13:18PM +0200, Denys Fedoryshchenko wrote: > On 2018-02-15 21:42, Guillaume Nault wrote: > > On Thu, Feb 15, 2018 at 09:34:42PM +0200, Denys Fedoryshchenko wrote: > > > On 2018-02-15 21:31, Guillaume Nault wrote: > > > > On Thu, Feb 15, 2018 at 06:01:16PM +0200, Denys Fedoryshchenko wrote: > > > > > On 2018-02-15 17:55, Guillaume Nault wrote: > > > > > > On Thu, Feb 15, 2018 at 12:19:52PM +0200, Denys Fedoryshchenko > > > > > > wrote: > > > > > > > Here we go: > > > > > > > > > > > > > > [24558.921549] > > > > > > > == > > > > > > > [24558.922167] BUG: KASAN: use-after-free in > > > > > > > ppp_ioctl+0xa6a/0x1522 > > > > > > > [ppp_generic] > > > > > > > [24558.922776] Write of size 8 at addr 8803d35bf3f8 by > > > > > > > task > > > > > > > accel-pppd/12622 > > > > > > > [24558.923113] > > > > > > > [24558.923451] CPU: 0 PID: 12622 Comm: accel-pppd Tainted: > > > > > > > G > > > > > > > W > > > > > > > 4.15.3-build-0134 #1 > > > > > > > [24558.924058] Hardware name: HP ProLiant DL320e Gen8 v2, > > > > > > > BIOS P80 > > > > > > > 04/02/2015 > > > > > > > [24558.924406] Call Trace: > > > > > > > [24558.924753] dump_stack+0x46/0x59 > > > > > > > [24558.925103] print_address_description+0x6b/0x23b > > > > > > > [24558.925451] ? ppp_ioctl+0xa6a/0x1522 [ppp_generic] > > > > > > > [24558.925797] kasan_report+0x21b/0x241 > > > > > > > [24558.926136] ppp_ioctl+0xa6a/0x1522 [ppp_generic] > > > > > > > [24558.926479] ? ppp_nl_newlink+0x1da/0x1da [ppp_generic] > > > > > > > [24558.926829] ? sock_sendmsg+0x89/0x99 > > > > > > > [24558.927176] ? __vfs_write+0xd9/0x4ad > > > > > > > [24558.927523] ? kernel_read+0xed/0xed > > > > > > > [24558.927872] ? SyS_getpeername+0x18c/0x18c > > > > > > > [24558.928213] ? bit_waitqueue+0x2a/0x2a > > > > > > > [24558.928561] ? wake_atomic_t_function+0x115/0x115 > > > > > > > [24558.928898] vfs_ioctl+0x6e/0x81 > > > > > > > [24558.929228] do_vfs_ioctl+0xa00/0xb10 > > > > > > > [24558.929571] ? 
sigprocmask+0x1a6/0x1d0 > > > > > > > [24558.929907] ? sigsuspend+0x13e/0x13e > > > > > > > [24558.930239] ? ioctl_preallocate+0x14e/0x14e > > > > > > > [24558.930568] ? SyS_rt_sigprocmask+0xf1/0x142 > > > > > > > [24558.930904] ? sigprocmask+0x1d0/0x1d0 > > > > > > > [24558.931252] SyS_ioctl+0x39/0x55 > > > > > > > [24558.931595] ? do_vfs_ioctl+0xb10/0xb10 > > > > > > > [24558.931942] do_syscall_64+0x1b1/0x31f > > > > > > > [24558.932288] entry_SYSCALL_64_after_hwframe+0x21/0x86 > > > > > > > [24558.932627] RIP: 0033:0x7f302849d8a7 > > > > > > > [24558.932965] RSP: 002b:7f3029a52af8 EFLAGS: 0206 > > > > > > > ORIG_RAX: > > > > > > > 0010 > > > > > > > [24558.933578] RAX: ffda RBX: 7f3027d861e3 > > > > > > > RCX: > > > > > > > 7f302849d8a7 > > > > > > > [24558.933927] RDX: 7f3023f49468 RSI: 4004743a > > > > > > > RDI: > > > > > > > 3a67 > > > > > > > [24558.934266] RBP: 7f3029a52b20 R08: > > > > > > > R09: > > > > > > > 55c8308d8e40 > > > > > > > [24558.934607] R10: 0008 R11: 0206 > > > > > > > R12: > > > > > > > 7f3023f49358 > > > > > > > [24558.934947] R13: 7ffe86e5723f R14: > > > > > > > R15: > > > > > > > 7f3029a53700 > > > > > > > [24558.935288] > > > > > > > [24558.935626] Allocated by task 12622: > > > > > > > [24558.935972] ppp_register_net_channel+0x5f/0x5c6 > > > > > > > [ppp_generic] > > > > > > > [24558.936306] pppoe_connect+0xab7/0xc71 [pppoe] > > > > > > > [24558.936640] SyS_connect+0x14b/0x1b7 > > > > > > > [24558.936975] do_syscall_64+0x1b1/0x31f > > > > > > > [24558.937319] entry_SYSCALL_64_after_hwframe+0x21/0x86 > > > > > > > [24558.937655] > > > > > > > [24558.937993] Freed by task 12622: > > > > > > > [24558.938321] kfree+0xb0/0x11d > > > > > > > [24558.938658] ppp_release+0x111/0x120 [ppp_generic] > > > > > > > [24558.938994] __fput+0x2ba/0x51a > > > > > > > [24558.939332] task_work_run+0x11c/0x13d > > > > > > > [24558.939676] exit_to_usermode_loop+0x7c/0xaf > > > > > > > [24558.940022] do_syscall_64+0x2ea/0x31f > > > > > > > [24558.940368] 
entry_SYSCALL_64_after_hwframe+0x21/0x86 > > > > > > > [24558.947099] > > > > > > > > > > > > Your first guess was right. It looks like we have an issue with > > > > > > reference counting on the channels. Can you send me your > > > > > > ppp_generic.o? > > > > > http://nuclearcat.com/ppp_generic.o > > > > > Compiled with gcc version 6.4.0 (Gentoo 6.4.0-r1 p1.3) > > > > > > > > > From what I can see, ppp_release() and ioctl(PPPIOCCONNECT) are called > > > > concurrently on the same ppp_file. Even if this ppp_file was pointed at > > > > by two different file descriptors, I can't see how this could defeat > > > > the reference counting mechanism. I'm going
Re: [PATCH net-next 0/3] eBPF Seccomp filters
On Wed, Feb 14, 2018 at 8:30 PM, Alexei Starovoitovwrote: > On Wed, Feb 14, 2018 at 10:32:22AM -0700, Tycho Andersen wrote: >> > > >> > > What's the reason for adding eBPF support? seccomp shouldn't need it, >> > > and it only makes the code more complex. I'd rather stick with cBPF >> > > until we have an overwhelmingly good reason to use eBPF as a "native" >> > > seccomp filter language. >> > > >> > >> > I can think of two fairly strong use cases for eBPF's ability to call >> > functions: logging and Tycho's user notifier thing. >> >> Worth noting that there is one additional thing that I didn't >> implement, but which would be nice and is probably not possible with >> eBPF (at least, not without a bunch of additional infrastructure): >> passing fds back to the tracee from the manager if you intercept >> socket(), or accept() or something. >> >> This could again be accomplished via other means, though it would be a >> lot nicer to have a primitive for it. > > there is bpf_perf_event_output() interface that allows to stream > arbitrary data from kernel into user space via perf ring buffer. > User space can epoll on it. We use this in both tracing and networking > for notifications and streaming data transfers. > I suspect this can be used for 'logging' too, since it's cheap and fast. > > Specifically for android we added bpf_lsm hooks, cookie/uid helpers, > and read-only maps. > Lorenzo, > there was a claim in this thread that bpf is disabled on android. > Can you please clarify ? > If it's actually disabled and there is no intent to enable it, > I'd rather not add any more android specific features to bpf. > > What I think is important to understand is that BPF goes through > very active development. The verifier is constantly getting smarter. > There is work to add bounded loops, lock/unlock, get/put tracking, > global/percpu variables, dynamic linking and so on. > Most of the features are available to root only and unpriv > has very limited set. 
Like getting bpf_perf_event_output() to work > for unpriv will likely require additional verifier work. > > So all cool bits will not be usable by seccomp+eBPF and unpriv > on day one. It's not a lot of work either, but once it's done > I'd hate to see arguments against adding more verifier features > just because eBPF is used by seccomp/landlock/other_security_thing. > > Also I think the argument that seccomp+eBPF will be faster than > seccomp+cBPF is a weak one. I bet kpti on/off makes no difference > under seccomp, since _all_ syscalls are already slow for sandboxed app. > Instead of making seccomp 5% faster with eBPF, I think it's > worth looking into extending LSM hooks to cover all syscalls and > have programmable (bpf or whatever) filtering applied per syscall. > Like we can have a white list syscall table covered by lsm hooks > and any other syscall will get into old seccomp-style > filtering category automatically. > lsm+bpf would need to follow process hierarchy. It shouldn't be > a runtime check at syscall entry either, but compile time > extra branch in SYSCALL_DEFINE for non-whitelisted syscalls. > There are bunch of other things to figure out, but I think > the perf win will be bigger than replacing cBPF with eBPF in > existing seccomp. > Given this test program: for (i = 10; i < ; i++) syscall(__NR_getpid); If I implement an eBPF filter with PROG_ARRAYs, and tail call, the numbers are such: ebpf JIT 12.3% slower than native ebpf no JIT 13.6% slower than native seccomp JIT 17.6% slower than native seccomp no JIT 37% slower than native This is using libseccomp for the standard seccomp BPF program. There's no reasonable way for our workload to know which syscalls come "earlier", so we can't take that optimization. Potentially, libseccomp can be smarter about ordering cases (using ranges), and use an O(log(n)) search algorithm, but both of these are microptimizations that scale with the number of syscalls and per-syscall rules. 
The nicety of using a PROG_ARRAY means that adding additional filters (syscalls) comes at no cost, whereas there's a tradeoff any time you add another rule in traditional seccomp filters. This was tested on an Amazon M4.16XL running with pcid, and KPTI.
Re: [PATCH v3 2/2] net: ethernet: nixge: Add support for National Instruments XGE netdev
On Fri, Feb 16, 2018 at 09:00:33AM -0800, Moritz Fischer wrote: > +#define NIXGE_MDIO_CLAUSE45 BIT(12) > +#define NIXGE_MDIO_CLAUSE22 0 > +#define NIXGE_MDIO_OP(n) (((n) & 0x3) << 10) > +#define NIXGE_MDIO_OP_ADDRESS0 > +#define NIXGE_MDIO_OP_WRITE BIT(0) > +#define NIXGE_MDIO_OP_READ (BIT(1) | BIT(0)) > +#define MDIO_C22_WRITE BIT(0) > +#define MDIO_C22_READBIT(1) > +#define MDIO_READ_POST 2 Hi Moritz NIXGE prefix? > +#define NIXGE_MDIO_ADDR(n) (((n) & 0x1f) << 5) > +#define NIXGE_MDIO_MMD(n)(((n) & 0x1f) << 0) > + > +#define NIXGE_MAX_PHY_ADDR 32 This is nothing specific to this device. Please use the standard PHY_MAX_ADDR. > +static int nixge_open(struct net_device *ndev) > +{ > + struct nixge_priv *priv = netdev_priv(ndev); > + struct phy_device *phy; > + int ret; > + > + nixge_device_reset(ndev); > + > + phy = of_phy_connect(ndev, priv->phy_node, > + _handle_link_change, 0, priv->phy_mode); > + if (!phy) > + return -ENODEV; > + > + phy_start(phy); > + > + /* Enable tasklets for Axi DMA error handling */ > + tasklet_init(>dma_err_tasklet, nixge_dma_err_handler, > + (unsigned long)priv); > + > + napi_enable(>napi); > + > + /* Enable interrupts for Axi DMA Tx */ > + ret = request_irq(priv->tx_irq, nixge_tx_irq, 0, ndev->name, ndev); > + if (ret) > + goto err_tx_irq; > + /* Enable interrupts for Axi DMA Rx */ > + ret = request_irq(priv->rx_irq, nixge_rx_irq, 0, ndev->name, ndev); > + if (ret) > + goto err_rx_irq; > + > + netif_start_queue(ndev); > + > + return 0; > + > +err_rx_irq: > + free_irq(priv->tx_irq, ndev); > +err_tx_irq: > + tasklet_kill(>dma_err_tasklet); > + netdev_err(ndev, "request_irq() failed\n"); > + return ret; Maybe you also want to stop the phy and disconnect it? > +static int nixge_stop(struct net_device *ndev) > +{ > + struct nixge_priv *priv = netdev_priv(ndev); > + u32 cr; > + > + netif_stop_queue(ndev); > + napi_disable(>napi); > + > + if (ndev->phydev) > + phy_stop(ndev->phydev); phy_disconnect() ? 
> + cr = nixge_dma_read_reg(priv, XAXIDMA_RX_CR_OFFSET); > + nixge_dma_write_reg(priv, XAXIDMA_RX_CR_OFFSET, > + cr & (~XAXIDMA_CR_RUNSTOP_MASK)); > + cr = nixge_dma_read_reg(priv, XAXIDMA_TX_CR_OFFSET); > + nixge_dma_write_reg(priv, XAXIDMA_TX_CR_OFFSET, > + cr & (~XAXIDMA_CR_RUNSTOP_MASK)); > + > + tasklet_kill(>dma_err_tasklet); > + > + free_irq(priv->tx_irq, ndev); > + free_irq(priv->rx_irq, ndev); > + > + nixge_hw_dma_bd_release(ndev); > + > + return 0; > +} > +static int nixge_mdio_read(struct mii_bus *bus, int phy_id, int reg) > +{ > + struct nixge_priv *priv = bus->priv; > + u32 status, tmp; > + int err; > + u16 device; > + > + if (reg & MII_ADDR_C45) { > + device = (reg >> 16) & 0x1f; > + > + nixge_ctrl_write_reg(priv, NIXGE_REG_MDIO_ADDR, reg & 0x); > + > + tmp = NIXGE_MDIO_CLAUSE45 | NIXGE_MDIO_OP(NIXGE_MDIO_OP_ADDRESS) > + | NIXGE_MDIO_ADDR(phy_id) | NIXGE_MDIO_MMD(device); > + > + nixge_ctrl_write_reg(priv, NIXGE_REG_MDIO_OP, tmp); > + nixge_ctrl_write_reg(priv, NIXGE_REG_MDIO_CTRL, 1); > + > + err = nixge_ctrl_poll_timeout(priv, NIXGE_REG_MDIO_CTRL, status, > + !status, 10, 1000); > + if (err) { > + dev_err(priv->dev, "timeout setting address"); > + return err; > + } > + > + tmp = NIXGE_MDIO_CLAUSE45 | NIXGE_MDIO_OP(NIXGE_MDIO_OP_READ) | > + NIXGE_MDIO_ADDR(phy_id) | NIXGE_MDIO_MMD(device); > + } else { > + device = reg & 0x1f; > + > + tmp = NIXGE_MDIO_CLAUSE22 | NIXGE_MDIO_OP(MDIO_C22_READ) | > + NIXGE_MDIO_ADDR(phy_id) | NIXGE_MDIO_MMD(device); It would be nice to have some naming consistency here. NIXGE_MDIO_C45_READ and NIXGE_MDIO_C22_READ? 
> + } > + > + nixge_ctrl_write_reg(priv, NIXGE_REG_MDIO_OP, tmp); > + nixge_ctrl_write_reg(priv, NIXGE_REG_MDIO_CTRL, 1); > + > + err = nixge_ctrl_poll_timeout(priv, NIXGE_REG_MDIO_CTRL, status, > + !status, 10, 1000); > + if (err) { > + dev_err(priv->dev, "timeout setting read command"); > + return err; > + } > + > + status = nixge_ctrl_read_reg(priv, NIXGE_REG_MDIO_DATA); > + > + return status; > +} > + > +static int nixge_mdio_setup(struct nixge_priv *priv, struct device_node *np) > +{ > + struct mii_bus *bus; > + int err; > + > + bus = mdiobus_alloc(); devm_mdiobus_alloc() ? > + if (!bus) > + return -ENOMEM; > + > + snprintf(bus->id, MII_BUS_ID_SIZE,
[RFC PATCH v3 3/3] virtio_net: Enable alternate datapath without creating an additional netdev
This patch addresses the issues that were seen with the 3 netdev model by avoiding the creation of an additional netdev. Instead the bypass state information is tracked in the original netdev and a different set of ndo_ops and ethtool_ops are used when BACKUP feature is enabled. Signed-off-by: Sridhar SamudralaReviewed-by: Alexander Duyck --- drivers/net/virtio_net.c | 283 +-- 1 file changed, 101 insertions(+), 182 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 14679806c1b1..c85b2949f151 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -154,7 +154,7 @@ struct virtnet_bypass_info { struct net_device __rcu *active_netdev; /* virtio_net netdev */ - struct net_device __rcu *backup_netdev; + struct net_device *backup_netdev; /* active netdev stats */ struct rtnl_link_stats64 active_stats; @@ -229,8 +229,8 @@ struct virtnet_info { unsigned long guest_offloads; - /* upper netdev created when BACKUP feature enabled */ - struct net_device *bypass_netdev; + /* bypass state maintained when BACKUP feature is enabled */ + struct virtnet_bypass_info *vbi; }; struct padded_vnet_hdr { @@ -2285,6 +2285,22 @@ static bool virtnet_bypass_xmit_ready(struct net_device *dev) return netif_running(dev) && netif_carrier_ok(dev); } +static bool virtnet_bypass_active_ready(struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct virtnet_bypass_info *vbi = vi->vbi; + struct net_device *active; + + if (!vbi) + return false; + + active = rcu_dereference(vbi->active_netdev); + if (!active || !virtnet_bypass_xmit_ready(active)) + return false; + + return true; +} + static void virtnet_config_changed_work(struct work_struct *work) { struct virtnet_info *vi = @@ -2312,7 +2328,7 @@ static void virtnet_config_changed_work(struct work_struct *work) virtnet_update_settings(vi); netif_carrier_on(vi->dev); netif_tx_wake_all_queues(vi->dev); - } else { + } else if (!virtnet_bypass_active_ready(vi->dev)) { 
netif_carrier_off(vi->dev); netif_tx_stop_all_queues(vi->dev); } @@ -2501,7 +2517,8 @@ static int virtnet_find_vqs(struct virtnet_info *vi) if (vi->has_cvq) { vi->cvq = vqs[total_vqs - 1]; - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN) && + !virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP)) vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; } @@ -2690,62 +2707,54 @@ virtnet_bypass_child_open(struct net_device *dev, static int virtnet_bypass_open(struct net_device *dev) { - struct virtnet_bypass_info *vbi = netdev_priv(dev); + struct virtnet_info *vi = netdev_priv(dev); + struct virtnet_bypass_info *vbi = vi->vbi; struct net_device *child_netdev; - - netif_carrier_off(dev); - netif_tx_wake_all_queues(dev); + int err; child_netdev = rtnl_dereference(vbi->active_netdev); if (child_netdev) virtnet_bypass_child_open(dev, child_netdev); - child_netdev = rtnl_dereference(vbi->backup_netdev); - if (child_netdev) - virtnet_bypass_child_open(dev, child_netdev); + err = virtnet_open(dev); + if (err < 0) { + dev_close(child_netdev); + return err; + } return 0; } static int virtnet_bypass_close(struct net_device *dev) { - struct virtnet_bypass_info *vi = netdev_priv(dev); + struct virtnet_info *vi = netdev_priv(dev); + struct virtnet_bypass_info *vbi = vi->vbi; struct net_device *child_netdev; - netif_tx_disable(dev); + virtnet_close(dev); - child_netdev = rtnl_dereference(vi->active_netdev); - if (child_netdev) - dev_close(child_netdev); + if (!vbi) + goto done; - child_netdev = rtnl_dereference(vi->backup_netdev); + child_netdev = rtnl_dereference(vbi->active_netdev); if (child_netdev) dev_close(child_netdev); +done: return 0; } -static netdev_tx_t -virtnet_bypass_drop_xmit(struct sk_buff *skb, struct net_device *dev) -{ - atomic_long_inc(>tx_dropped); - dev_kfree_skb_any(skb); - return NETDEV_TX_OK; -} - static netdev_tx_t virtnet_bypass_start_xmit(struct sk_buff *skb, struct net_device *dev) { - struct 
virtnet_bypass_info *vbi = netdev_priv(dev); + struct virtnet_info *vi = netdev_priv(dev); + struct virtnet_bypass_info *vbi = vi->vbi; struct net_device
[RFC PATCH v3 2/3] virtio_net: Extend virtio to use VF datapath when available
This patch enables virtio_net to switch over to a VF datapath when a VF netdev is present with the same MAC address. It allows live migration of a VM with a direct attached VF without the need to setup a bond/team between a VF and virtio net device in the guest. The hypervisor needs to enable only one datapath at any time so that packets don't get looped back to the VM over the other datapath. When a VF is plugged, the virtio datapath link state can be marked as down. The hypervisor needs to unplug the VF device from the guest on the source host and reset the MAC filter of the VF to initiate failover of datapath to virtio before starting the migration. After the migration is completed, the destination hypervisor sets the MAC filter on the VF and plugs it back to the guest to switch over to VF datapath. When BACKUP feature is enabled, an additional netdev(bypass netdev) is created that acts as a master device and tracks the state of the 2 lower netdevs. The original virtio_net netdev is marked as 'backup' netdev and a passthru device with the same MAC is registered as 'active' netdev. This patch is based on the discussion initiated by Jesse on this thread. 
https://marc.info/?l=linux-virtualization=151189725224231=2 Signed-off-by: Sridhar SamudralaSigned-off-by: Alexander Duyck --- drivers/net/virtio_net.c | 639 ++- 1 file changed, 638 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index bcd13fe906ca..14679806c1b1 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -147,6 +148,27 @@ struct receive_queue { struct xdp_rxq_info xdp_rxq; }; +/* bypass state maintained when BACKUP feature is enabled */ +struct virtnet_bypass_info { + /* passthru netdev with same MAC */ + struct net_device __rcu *active_netdev; + + /* virtio_net netdev */ + struct net_device __rcu *backup_netdev; + + /* active netdev stats */ + struct rtnl_link_stats64 active_stats; + + /* backup netdev stats */ + struct rtnl_link_stats64 backup_stats; + + /* aggregated stats */ + struct rtnl_link_stats64 bypass_stats; + + /* spinlock while updating stats */ + spinlock_t stats_lock; +}; + struct virtnet_info { struct virtio_device *vdev; struct virtqueue *cvq; @@ -206,6 +228,9 @@ struct virtnet_info { u32 speed; unsigned long guest_offloads; + + /* upper netdev created when BACKUP feature enabled */ + struct net_device *bypass_netdev; }; struct padded_vnet_hdr { @@ -2255,6 +2280,11 @@ static const struct net_device_ops virtnet_netdev = { .ndo_features_check = passthru_features_check, }; +static bool virtnet_bypass_xmit_ready(struct net_device *dev) +{ + return netif_running(dev) && netif_carrier_ok(dev); +} + static void virtnet_config_changed_work(struct work_struct *work) { struct virtnet_info *vi = @@ -2647,6 +2677,601 @@ static int virtnet_validate(struct virtio_device *vdev) return 0; } +static void +virtnet_bypass_child_open(struct net_device *dev, + struct net_device *child_netdev) +{ + int err = dev_open(child_netdev); + + if (err) + netdev_warn(dev, "unable to open slave: %s: %d\n", + child_netdev->name, err); 
+} + +static int virtnet_bypass_open(struct net_device *dev) +{ + struct virtnet_bypass_info *vbi = netdev_priv(dev); + struct net_device *child_netdev; + + netif_carrier_off(dev); + netif_tx_wake_all_queues(dev); + + child_netdev = rtnl_dereference(vbi->active_netdev); + if (child_netdev) + virtnet_bypass_child_open(dev, child_netdev); + + child_netdev = rtnl_dereference(vbi->backup_netdev); + if (child_netdev) + virtnet_bypass_child_open(dev, child_netdev); + + return 0; +} + +static int virtnet_bypass_close(struct net_device *dev) +{ + struct virtnet_bypass_info *vi = netdev_priv(dev); + struct net_device *child_netdev; + + netif_tx_disable(dev); + + child_netdev = rtnl_dereference(vi->active_netdev); + if (child_netdev) + dev_close(child_netdev); + + child_netdev = rtnl_dereference(vi->backup_netdev); + if (child_netdev) + dev_close(child_netdev); + + return 0; +} + +static netdev_tx_t +virtnet_bypass_drop_xmit(struct sk_buff *skb, struct net_device *dev) +{ + atomic_long_inc(>tx_dropped); + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; +} + +static netdev_tx_t +virtnet_bypass_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct virtnet_bypass_info *vbi = netdev_priv(dev); + struct net_device *xmit_dev; + + /* Try
[RFC PATCH v3 1/3] virtio_net: Introduce VIRTIO_NET_F_BACKUP feature bit
This feature bit can be used by hypervisor to indicate virtio_net device to act as a backup for another device with the same MAC address. VIRTIO_NET_F_BACKUP is defined as bit 62 as it is a device feature bit. Signed-off-by: Sridhar Samudrala --- drivers/net/virtio_net.c| 2 +- include/uapi/linux/virtio_net.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 626c27352ae2..bcd13fe906ca 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2920,7 +2920,7 @@ static struct virtio_device_id id_table[] = { VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \ VIRTIO_NET_F_CTRL_MAC_ADDR, \ VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \ - VIRTIO_NET_F_SPEED_DUPLEX + VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_BACKUP static unsigned int features[] = { VIRTNET_FEATURES, diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 5de6ed37695b..c7c35fd1a5ed 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,9 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ +#define VIRTIO_NET_F_BACKUP 62 /* Act as backup for another device +* with the same MAC. +*/ #define VIRTIO_NET_F_SPEED_DUPLEX 63 /* Device set linkspeed and duplex */ #ifndef VIRTIO_NET_NO_LEGACY -- 2.14.3
[RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be used by hypervisor to indicate that virtio_net interface should act as a backup for another device with the same MAC address. Patch 2 is in response to the community request for a 3 netdev solution. However, it creates some issues we'll get into in a moment. It extends virtio_net to use alternate datapath when available and registered. When BACKUP feature is enabled, virtio_net driver creates an additional 'bypass' netdev that acts as a master device and controls 2 slave devices. The original virtio_net netdev is registered as 'backup' netdev and a passthru/vf device with the same MAC gets registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are associated with the same 'pci' device. The user accesses the network interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev as default for transmits when it is available with link up and running. We noticed a couple of issues with this approach during testing. - As both 'bypass' and 'backup' netdevs are associated with the same virtio pci device, udev tries to rename both of them with the same name and the 2nd rename will fail. This would be OK as long as the first netdev to be renamed is the 'bypass' netdev, but the order in which udev gets to rename the 2 netdevs is not reliable. - When the 'active' netdev is unplugged OR not present on a destination system after live migration, the user will see 2 virtio_net netdevs. Patch 3 refactors much of the changes made in patch 2, which was done on purpose just to show the solution we recommend as part of one patch set. If we submit a final version of this, we would combine patch 2/3 together. This patch removes the creation of an additional netdev. Instead, it uses a new virtnet_bypass_info struct added to the original 'backup' netdev to track the 'bypass' information and introduces an additional set of ndo and ethtool ops that are used when BACKUP feature is enabled. 
One difference with the 3 netdev model compared to the 2 netdev model is that the 'bypass' netdev is created with 'noqueue' qdisc marked as 'NETIF_F_LLTX'. This avoids going through an additional qdisc and acquiring an additional qdisc and tx lock during transmits. If we can replace the qdisc of virtio netdev dynamically, it should be possible to get these optimizations enabled even with 2 netdev model when BACKUP feature is enabled. As this patch series is initially focusing on usecases where hypervisor fully controls the VM networking and the guest is not expected to directly configure any hardware settings, it doesn't expose all the ndo/ethtool ops that are supported by virtio_net at this time. To support additional usecases, it should be possible to enable additional ops later by caching the state in virtio netdev and replaying when the 'active' netdev gets registered. The hypervisor needs to enable only one datapath at any time so that packets don't get looped back to the VM over the other datapath. When a VF is plugged, the virtio datapath link state can be marked as down. At the time of live migration, the hypervisor needs to unplug the VF device from the guest on the source host and reset the MAC filter of the VF to initiate failover of datapath to virtio before starting the migration. After the migration is completed, the destination hypervisor sets the MAC filter on the VF and plugs it back to the guest to switch over to VF datapath. This patch is based on the discussion initiated by Jesse on this thread. https://marc.info/?l=linux-virtualization=151189725224231=2 Sridhar Samudrala (3): virtio_net: Introduce VIRTIO_NET_F_BACKUP feature bit virtio_net: Extend virtio to use VF datapath when available virtio_net: Enable alternate datapath without creating an additional netdev drivers/net/virtio_net.c| 564 +++- include/uapi/linux/virtio_net.h | 3 + 2 files changed, 563 insertions(+), 4 deletions(-) -- 2.14.3
[PATCH] net: dsa: mv88e6xxx: hwtstamp: remove unnecessary range checking tests
_port_ is already known to be a valid index in the callers [1]. So these checks are unnecessary. [1] https://lkml.org/lkml/2018/2/16/469 Addresses-Coverity-ID: 1465287 Addresses-Coverity-ID: 1465291 Suggested-by: Richard Cochran Signed-off-by: Gustavo A. R. Silva --- drivers/net/dsa/mv88e6xxx/hwtstamp.c | 9 - 1 file changed, 9 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.c b/drivers/net/dsa/mv88e6xxx/hwtstamp.c index b251d53..c6d6a35 100644 --- a/drivers/net/dsa/mv88e6xxx/hwtstamp.c +++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.c @@ -179,9 +179,6 @@ int mv88e6xxx_port_hwtstamp_set(struct dsa_switch *ds, int port, if (!chip->info->ptp_support) return -EOPNOTSUPP; - if (port < 0 || port >= mv88e6xxx_num_ports(chip)) - return -EINVAL; - if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) return -EFAULT; @@ -206,9 +203,6 @@ int mv88e6xxx_port_hwtstamp_get(struct dsa_switch *ds, int port, if (!chip->info->ptp_support) return -EOPNOTSUPP; - if (port < 0 || port >= mv88e6xxx_num_ports(chip)) - return -EINVAL; - return copy_to_user(ifr->ifr_data, config, sizeof(*config)) ? -EFAULT : 0; } @@ -255,9 +249,6 @@ static u8 *mv88e6xxx_should_tstamp(struct mv88e6xxx_chip *chip, int port, if (!chip->info->ptp_support) return NULL; - if (port < 0 || port >= mv88e6xxx_num_ports(chip)) - return NULL; - hdr = parse_ptp_header(skb, type); if (!hdr) return NULL; -- 2.7.4
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On 02/16/18 18:25, Oleksandr Natalenko wrote: > So, going on with two real HW hosts. They are both running latest stock Arch > Linux kernel (4.15.3-1-ARCH, CONFIG_PREEMPT=y, CONFIG_HZ=1000) and are > interconnected with 1 Gbps link (via switch if that matters). Using iperf3, > running each test for 20 seconds. > > Having BBR+fq_codel (or pfifo_fast, same result) on both hosts: > > Client to server: 112 Mbits/sec > Server to client: 96.1 Mbits/sec > > Having BBR+fq on both hosts: > > Client to server: 347 Mbits/sec > Server to client: 397 Mbits/sec > > Having YeAH+fq on both hosts: > > Client to server: 928 Mbits/sec > Server to client: 711 Mbits/sec > > (when the server generates traffic, the throughput is a little bit lower, as > you can see, but I assume that's because I have there low-power Silvermont > CPU, when the client has Ivy Bridge beast) There is simply no reason why you shouldn't get approx. line rate (~920+-ish) Mbit over wired 1GBit Ethernet; even my broken 10-year old Core2Duo laptop can do that. Can you boot with spectre_v2=off and try "the simplest case" with the defaults cubic/pfifo_fast? spectre_v2 has terrible performance impact esp. on small/older processors. When I last benchmarked full PREEMPT with 4.9.x it was similarly bad and also had a noticeable network throughput impact even on my i7. Also congratulations for being the only other person I know who ever tried YeAH. :-) cheers Holger
Re: [PATCH v2] net: dsa: mv88e6xxx: hwtstamp: fix potential negative array index read
On 02/16/2018 09:56 AM, Richard Cochran wrote: On Fri, Feb 16, 2018 at 07:48:46AM -0800, Richard Cochran wrote: On Thu, Feb 15, 2018 at 12:31:39PM -0600, Gustavo A. R. Silva wrote: _port_ is being used as index to array port_hwtstamp before verifying it is a non-negative number and a valid index at line 209 and 258: if (port < 0 || port >= mv88e6xxx_num_ports(chip)) Fix this by checking _port_ before using it as index to array port_hwtstamp. NAK. Port is already known to be valid in the callers. And so the real bug is the pointless range checking tests. I would welcome patches to remove those. I just sent a patch for this. Thank you both, Andrew and Richard for the feedback. -- Gustavo
Re: TCP and BBR: reproducibly low cwnd and bandwidth
Hi. On pátek 16. února 2018 17:25:58 CET Eric Dumazet wrote: > The way TCP pacing works, it defaults to internal pacing using a hint > stored in the socket. > > If you change the qdisc while flow is alive, result could be unexpected. I don't change a qdisc while flow is alive. Either the VM is completely restarted, or iperf3 is restarted on both sides. > (TCP socket remembers that one FQ was supposed to handle the pacing) > > What results do you have if you use standard pfifo_fast ? Almost the same as with fq_codel (see my previous email with numbers). > I am asking because TCP pacing relies on High resolution timers, and > that might be weak on your VM. Also, I've switched to measuring things on a real HW only (also see previous email with numbers). Thanks. Regards, Oleksandr
Re: TCP and BBR: reproducibly low cwnd and bandwidth
Hi. On pátek 16. února 2018 17:26:11 CET Holger Hoffstätte wrote: > These are very odd configurations. :) > Non-preempt/100 might well be too slow, whereas PREEMPT/1000 might simply > have too much overhead. Since the pacing is based on hrtimers, should HZ matter at all? Even if so, poor 1 Gbps link shouldn't drop to below 100 Mbps, for sure. > BBR in general will run with lower cwnd than e.g. Cubic or others. > That's a feature and necessary for WAN transfers. Okay, got it. > Something seems really wrong with your setup. I get completely > expected throughput on wired 1Gb between two hosts: > /* snip */ Yes, and that's strange :/. And that's why I'm wondering what I am missing since things cannot be *that* bad. > /* snip */ > Please note that BBR was developed to address the case of WAN transfers > (or more precisely high BDP paths) which often suffer from TCP throughput > collapse due to single packet loss events. While it might "work" in other > scenarios as well, strictly speaking delay-based anything is increasingly > less likely to work when there is no meaningful notion of delay - such > as on a LAN. (yes, this is very simplified..) > > The BBR mailing list has several nice reports why the current BBR > implementation (dubbed v1) has a few - sometimes severe - problems. > These are being addressed as we speak. > > (let me know if you want some of those tech reports by email. :) Well, yes, please, why not :). > /* snip */ > I'm not sure testing the old version without builtin pacing is going to help > matters in finding the actual problem. :) > Several people have reported severe performance regressions with 4.15.x, > maybe that's related. Can you test latest 4.14.x? Observed this on v4.14 too but didn't pay much attention until realised that things look definitely wrong. > Out of curiosity, what is the expected use case for BBR here? Nothing special, just assumed it could be set as a default for both WAN and LAN usage. Regards, Oleksandr
Re: TCP and BBR: reproducibly low cwnd and bandwidth
Hi. On pátek 16. února 2018 17:33:48 CET Neal Cardwell wrote: > Thanks for the detailed report! Yes, this sounds like an issue in BBR. We > have not run into this one in our team, but we will try to work with you to > fix this. > > Would you be able to take a sender-side tcpdump trace of the slow BBR > transfer ("v4.13 + BBR + fq_codel == Not OK")? Packet headers only would be > fine. Maybe something like: > > tcpdump -w /tmp/test.pcap -c100 -s 100 -i eth0 port $PORT So, going on with two real HW hosts. They are both running latest stock Arch Linux kernel (4.15.3-1-ARCH, CONFIG_PREEMPT=y, CONFIG_HZ=1000) and are interconnected with 1 Gbps link (via switch if that matters). Using iperf3, running each test for 20 seconds. Having BBR+fq_codel (or pfifo_fast, same result) on both hosts: Client to server: 112 Mbits/sec Server to client: 96.1 Mbits/sec Having BBR+fq on both hosts: Client to server: 347 Mbits/sec Server to client: 397 Mbits/sec Having YeAH+fq on both hosts: Client to server: 928 Mbits/sec Server to client: 711 Mbits/sec (when the server generates traffic, the throughput is a little bit lower, as you can see, but I assume that's because I have there low-power Silvermont CPU, when the client has Ivy Bridge beast) Now, to tcpdump. I've captured it 2 times, for client-to-server flow (c2s) and for server-to-client flow (s2c) while using BBR + pfifo_fast: # tcpdump -w test_XXX.pcap -c100 -s 100 -i enp2s0 port 5201 I've uploaded both files here [1]. Thanks. Oleksandr [1] https://natalenko.name/myfiles/bbr/
Re: [RFC net PATCH] virtio_net: disable XDP_REDIRECT in receive_mergeable() case
On 02/16/2018 07:41 AM, Jesper Dangaard Brouer wrote: > On Fri, 16 Feb 2018 13:31:37 +0800 > Jason Wang wrote: > >> On 2018年02月16日 06:43, Jesper Dangaard Brouer wrote: >>> The virtio_net code have three different RX code-paths in receive_buf(). >>> Two of these code paths can handle XDP, but one of them is broken for >>> at least XDP_REDIRECT. >>> >>> Function(1): receive_big() does not support XDP. >>> Function(2): receive_small() support XDP fully and uses build_skb(). >>> Function(3): receive_mergeable() broken XDP_REDIRECT uses napi_alloc_skb(). >>> >>> The simple explanation is that receive_mergeable() is broken because >>> it uses napi_alloc_skb(), which violates XDP given XDP assumes packet >>> header+data in single page and enough tail room for skb_shared_info. >>> >>> The longer explanation is that receive_mergeable() tries to >>> work-around and satisfy these XDP requirements e.g. by having a >>> function xdp_linearize_page() that allocates and memcpy RX buffers >>> around (in case packet is scattered across multiple rx buffers). This >>> does currently satisfy XDP_PASS, XDP_DROP and XDP_TX (but only because >>> we have not implemented bpf_xdp_adjust_tail yet). >>> >>> The XDP_REDIRECT action combined with cpumap is broken, and cause hard >>> to debug crashes. The main issue is that the RX packet does not have >>> the needed tail-room (SKB_DATA_ALIGN(skb_shared_info)), causing >>> skb_shared_info to overlap the next packets head-room (in which cpumap >>> stores info). >>> >>> Reproducing depend on the packet payload length and if RX-buffer size >>> happened to have tail-room for skb_shared_info or not. But to make >>> this even harder to troubleshoot, the RX-buffer size is runtime >>> dynamically change based on an Exponentially Weighted Moving Average >>> (EWMA) over the packet length, when refilling RX rings. >>> >>> This patch only disable XDP_REDIRECT support in receive_mergeable() >>> case, because it can cause a real crash. 
>>> >>> But IMHO we should NOT support XDP in receive_mergeable() at all, >>> because the principles behind XDP are to gain speed by (1) code >>> simplicity, (2) sacrificing memory and (3) where possible moving >>> runtime checks to setup time. These principles are clearly being >>> violated in receive_mergeable(), that e.g. runtime track average >>> buffer size to save memory consumption. >> >> I agree to disable it for -net now. > > Okay... I'll send an official patch later. > >> For net-next, we probably can do: >> >> - drop xdp_linearize_page() and do XDP through generic XDP helper >> after skb was built > > I disagree strongly here - it makes no sense. > > Why do you want to explicit fallback to Generic-XDP? > (... then all the performance gain is gone!) > And besides, a couple of function calls later, the generic XDP code > will/can get invoked anyhow... > Hi, Can we get EWMA to ensure for majority of cases we have the extra head room? Seems we could just over-estimate the size by N-bytes. In some cases we may under-estimate and then would need to fall back to generic-xdp or otherwise growing the buffer which of course would be painful and slow, but presumably would happen rarely. I think it would be much better to keep this feature vs kill it and make its configuration even more painful to get XDP working on virtio. Disabling EWMA also seems reasonable to me. > > Take a step back: > What is the reason/use-case for implementing XDP inside virtio_net? > > From a DDoS/performance perspective XDP in virtio_net happens on the > "wrong-side" as it is activated _inside_ the guest OS, which is too > late for a DDoS filter, as the guest kick/switch overhead have already > occurred. > The hypervisor may not "know" how to detect DDOS if its specific to the VMs domain. In these cases we aren't measuring pps but are looking at cycles/packet. In this case dropping cycles/packet frees up CPU for useful work. Here I expect we can see real CPU % drop in VM by using XDP. 
The other use case is once we have a fast path NIC to VM in kernel we can expect, from you numbers below, 3+Mpps. I seem to remember from Jason's netdevconf talk that he had some experimental numbers that were even better. The other case is the hypervisor is not Linux and is feeding packets even faster DPDK numbers, again from Jason's slides, seemed to show 11+Mpps. XDP makes a lot of sense here IMO. (those specific pps numbers I pulled from memory but the point is feeding many Mpps into a VM should be doable) The last thing is we may see hardware VFs emulating virtio at some point. Then XDP would be needed. With the newer virtio spec under development my impression is the hardware emulation piece is becoming more of a focus. But, someone actually working on that could probably provide a more informed comment. > I do use XDP_DROP inside the guest (driver virtio_net), but just to > perform what I can zoom-in benchmarking, for perf-record isolating the > early RX
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On 02/16/18 17:56, Neal Cardwell wrote: > On Fri, Feb 16, 2018 at 11:26 AM, Holger Hoffstätte >wrote: >> >> BBR in general will run with lower cwnd than e.g. Cubic or others. >> That's a feature and necessary for WAN transfers. > > Please note that there's no general rule about whether BBR will run > with a lower or higher cwnd than CUBIC, Reno, or other loss-based > congestion control algorithms. Whether BBR's cwnd will be lower or > higher depends on the BDP of the path, the amount of buffering in the > bottleneck, and the number of flows. BBR tries to match the amount of > in-flight data to the BDP based on the available bandwidth and the > two-way propagation delay. This will usually produce an amount of data > in flight that is smaller than CUBIC/Reno (yielding lower latency) if > the path has deep buffers (bufferbloat), but can be larger than > CUBIC/Reno (yielding higher throughput) if the buffers are shallow and > the traffic is suffering burst losses. In all my tests I've never seen it larger, but OK. Thanks for the explanation. :) On second reading the "necessary for WAN transfers" was phrased a bit unfortunately, but it likely doesn't matter for Oleksandr's case anyway.. (snip) >> Something seems really wrong with your setup. I get completely >> expected throughput on wired 1Gb between two hosts: >> >> Connecting to host tux, port 5201 >> [ 5] local 192.168.100.223 port 48718 connected to 192.168.100.222 port 5201 >> [ ID] Interval Transfer Bitrate Retr Cwnd >> [ 5] 0.00-1.00 sec 113 MBytes 948 Mbits/sec0204 KBytes >> [ 5] 1.00-2.00 sec 112 MBytes 941 Mbits/sec0204 KBytes >> [ 5] 2.00-3.00 sec 112 MBytes 941 Mbits/sec0204 KBytes >> [...] 
>> >> Running it locally gives the more or less expected results as well: >> >> Connecting to host ragnarok, port 5201 >> [ 5] local 192.168.100.223 port 54090 connected to 192.168.100.223 port 5201 >> [ ID] Interval Transfer Bitrate Retr Cwnd >> [ 5] 0.00-1.00 sec 8.09 GBytes 69.5 Gbits/sec0512 KBytes >> [ 5] 1.00-2.00 sec 8.14 GBytes 69.9 Gbits/sec0512 KBytes >> [ 5] 2.00-3.00 sec 8.43 GBytes 72.4 Gbits/sec0512 KBytes >> [...] >> >> Both hosts running 4.14.x with bbr and fq_codel (default qdisc everywhere). > > Can you please clarify if this is over bare metal or between VM > guests? It sounds like Oleksandr's initial report was between KVM VMs, > so the virtualization may be an ingredient here. These are real hosts, not VMs, wired by 1Gbit Ethernet (home office). Like Eric said it's probably weird HZ, slow host, iffy high-res timer (bad for both fq and fq_codel), overhead of retpoline in a VM or whatnot. cheers Holger
[PATCH v3 1/2] dt-bindings: net: Add bindings for National Instruments XGE netdev
This adds bindings for the NI XGE 1G/10G network device. Signed-off-by: Moritz Fischer--- Changes from v2: - Addressed Rob's comments w.r.t to IRQ names and typo Changes from v1: - Corrected from nixge -> nixge.txt --- Documentation/devicetree/bindings/net/nixge.txt | 32 + 1 file changed, 32 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/nixge.txt diff --git a/Documentation/devicetree/bindings/net/nixge.txt b/Documentation/devicetree/bindings/net/nixge.txt new file mode 100644 index ..e55af7f0881a --- /dev/null +++ b/Documentation/devicetree/bindings/net/nixge.txt @@ -0,0 +1,32 @@ +* NI XGE Ethernet controller + +Required properties: +- compatible: Should be "ni,xge-enet-2.00" +- reg: Address and length of the register set for the device +- interrupts: Should contain tx and rx interrupt +- interrupt-names: Should be "rx" and "tx" +- phy-mode: See ethernet.txt file in the same directory. +- phy-handle: See ethernet.txt file in the same directory. +- nvmem-cells: Phandle of nvmem cell containing the MAC address +- nvmem-cell-names: Should be "address" + +Examples (10G generic PHY): + nixge0: ethernet@4000 { + compatible = "ni,xge-enet-2.00"; + reg = <0x4000 0x6000>; + + nvmem-cells = <_addr>; + nvmem-cell-names = "address"; + + interrupts = <0 29 IRQ_TYPE_LEVEL_HIGH>, <0 30 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "rx", "tx"; + interrupt-parent = <>; + + phy-mode = "xgmii"; + phy-handle = <_phy1>; + + ethernet_phy1: ethernet-phy@4 { + compatible = "ethernet-phy-ieee802.3-c45"; + reg = <4>; + }; + }; -- 2.16.1
[PATCH v3 2/2] net: ethernet: nixge: Add support for National Instruments XGE netdev
Add support for the National Instruments XGE 1/10G network device. It uses the EEPROM on the board via NVMEM. Signed-off-by: Moritz Fischer--- Changes from v2: - Implement recv side NAPI - Improved error handling - Implemented C45 writes - Added ethtool callbacks & blink functionality - Improved nixge_ctrl_poll_timeout() macro - Removed dev_dbg() for mdio accesses - Added businfo to ethtool drvinfo Changes from v1: - Added dependency on ARCH_ZYNQ (Kbuild) - Removed unused variables - Use of_phy_connect as suggested - Removed masking of (un)supported modes - Added #define for some constants - Removed empty pm functions - Reworked mac_address handling - Made nixge_mdio_*() static (sparse) - Removed driver version - Addressed timeout loop - Adressed return values on timeout --- drivers/net/ethernet/Kconfig |1 + drivers/net/ethernet/Makefile|1 + drivers/net/ethernet/ni/Kconfig | 27 + drivers/net/ethernet/ni/Makefile |1 + drivers/net/ethernet/ni/nixge.c | 1352 ++ 5 files changed, 1382 insertions(+) create mode 100644 drivers/net/ethernet/ni/Kconfig create mode 100644 drivers/net/ethernet/ni/Makefile create mode 100644 drivers/net/ethernet/ni/nixge.c diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index b6cf4b6962f5..908218561fdd 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -129,6 +129,7 @@ config FEALNX source "drivers/net/ethernet/natsemi/Kconfig" source "drivers/net/ethernet/netronome/Kconfig" +source "drivers/net/ethernet/ni/Kconfig" source "drivers/net/ethernet/8390/Kconfig" config NET_NETX diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile index 3cdf01e96e0b..d732e9522b76 100644 --- a/drivers/net/ethernet/Makefile +++ b/drivers/net/ethernet/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_NET_VENDOR_MYRI) += myricom/ obj-$(CONFIG_FEALNX) += fealnx.o obj-$(CONFIG_NET_VENDOR_NATSEMI) += natsemi/ obj-$(CONFIG_NET_VENDOR_NETRONOME) += netronome/ +obj-$(CONFIG_NET_VENDOR_NI) += ni/ 
obj-$(CONFIG_NET_NETX) += netx-eth.o obj-$(CONFIG_NET_VENDOR_NUVOTON) += nuvoton/ obj-$(CONFIG_NET_VENDOR_NVIDIA) += nvidia/ diff --git a/drivers/net/ethernet/ni/Kconfig b/drivers/net/ethernet/ni/Kconfig new file mode 100644 index ..cd30f7de16de --- /dev/null +++ b/drivers/net/ethernet/ni/Kconfig @@ -0,0 +1,27 @@ +# +# National Instuments network device configuration +# + +config NET_VENDOR_NI + bool "National Instruments Devices" + default y + ---help--- + If you have a network (Ethernet) device belonging to this class, say Y. + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about National Instrument devices. + If you say Y, you will be asked for your specific device in the + following questions. + +if NET_VENDOR_NI + +config NI_XGE_MANAGEMENT_ENET + tristate "National Instruments XGE management enet support" + depends on ARCH_ZYNQ + select PHYLIB + ---help--- + Simple LAN device for debug or management purposes. Can + support either 10G or 1G PHYs via SFP+ ports. + +endif diff --git a/drivers/net/ethernet/ni/Makefile b/drivers/net/ethernet/ni/Makefile new file mode 100644 index ..99c664651c51 --- /dev/null +++ b/drivers/net/ethernet/ni/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_NI_XGE_MANAGEMENT_ENET) += nixge.o diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c new file mode 100644 index ..9b255c23d7cd --- /dev/null +++ b/drivers/net/ethernet/ni/nixge.c @@ -0,0 +1,1352 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2016-2017, National Instruments Corp. 
+ * + * Author: Moritz Fischer + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TX_BD_NUM 64 +#define RX_BD_NUM 128 + +/* Axi DMA Register definitions */ + +#define XAXIDMA_TX_CR_OFFSET 0x /* Channel control */ +#define XAXIDMA_TX_SR_OFFSET 0x0004 /* Status */ +#define XAXIDMA_TX_CDESC_OFFSET0x0008 /* Current descriptor pointer */ +#define XAXIDMA_TX_TDESC_OFFSET0x0010 /* Tail descriptor pointer */ + +#define XAXIDMA_RX_CR_OFFSET 0x0030 /* Channel control */ +#define XAXIDMA_RX_SR_OFFSET 0x0034 /* Status */ +#define XAXIDMA_RX_CDESC_OFFSET0x0038 /* Current descriptor pointer */ +#define XAXIDMA_RX_TDESC_OFFSET0x0040 /* Tail descriptor pointer */ + +#define XAXIDMA_CR_RUNSTOP_MASK0x0001 /* Start/stop DMA channel */ +#define XAXIDMA_CR_RESET_MASK 0x0004 /* Reset DMA engine */ + +#define XAXIDMA_BD_NDESC_OFFSET0x00 /* Next
Re: TCP and BBR: reproducibly low cwnd and bandwidth
Hi! On pátek 16. února 2018 17:45:56 CET Neal Cardwell wrote: > Eric raises a good question: bare metal vs VMs. > > Oleksandr, your first email mentioned KVM VMs and virtio NICs. Your > second e-mail did not seem to mention if those results were for bare > metal or a VM scenario: can you please clarify the details on your > second set of tests? Ugh, so many letters simultaneously… I'll answer them one by one if you don't mind :). Both the first and the second set of tests were performed on 2 KVM VMs, but from now I'll test everything using real HW only to exclude potential influence of virtualisation. Also, as I've already pointed out, on the real HW the difference is even bigger (~10 times). Now, I'm going to answer other emails of yours, including the actual results from the real HW and tcpdump output as requested. Thanks! Regards, Oleksandr
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On Fri, Feb 16, 2018 at 11:26 AM, Holger Hoffstättewrote: > > BBR in general will run with lower cwnd than e.g. Cubic or others. > That's a feature and necessary for WAN transfers. Please note that there's no general rule about whether BBR will run with a lower or higher cwnd than CUBIC, Reno, or other loss-based congestion control algorithms. Whether BBR's cwnd will be lower or higher depends on the BDP of the path, the amount of buffering in the bottleneck, and the number of flows. BBR tries to match the amount of in-flight data to the BDP based on the available bandwidth and the two-way propagation delay. This will usually produce an amount of data in flight that is smaller than CUBIC/Reno (yielding lower latency) if the path has deep buffers (bufferbloat), but can be larger than CUBIC/Reno (yielding higher throughput) if the buffers are shallow and the traffic is suffering burst losses. > >>> If using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput >>> to ~100 Mbps (verifiable not only by iperf3, but also by scp while >>> transferring some files between hosts). > > Something seems really wrong with your setup. I get completely > expected throughput on wired 1Gb between two hosts: > > Connecting to host tux, port 5201 > [ 5] local 192.168.100.223 port 48718 connected to 192.168.100.222 port 5201 > [ ID] Interval Transfer Bitrate Retr Cwnd > [ 5] 0.00-1.00 sec 113 MBytes 948 Mbits/sec0204 KBytes > [ 5] 1.00-2.00 sec 112 MBytes 941 Mbits/sec0204 KBytes > [ 5] 2.00-3.00 sec 112 MBytes 941 Mbits/sec0204 KBytes > [...] > > Running it locally gives the more or less expected results as well: > > Connecting to host ragnarok, port 5201 > [ 5] local 192.168.100.223 port 54090 connected to 192.168.100.223 port 5201 > [ ID] Interval Transfer Bitrate Retr Cwnd > [ 5] 0.00-1.00 sec 8.09 GBytes 69.5 Gbits/sec0512 KBytes > [ 5] 1.00-2.00 sec 8.14 GBytes 69.9 Gbits/sec0512 KBytes > [ 5] 2.00-3.00 sec 8.43 GBytes 72.4 Gbits/sec0512 KBytes > [...] 
> > Both hosts running 4.14.x with bbr and fq_codel (default qdisc everywhere). Can you please clarify if this is over bare metal or between VM guests? It sounds like Oleksandr's initial report was between KVM VMs, so the virtualization may be an ingredient here. thanks, neal
Re: [PATCH iproute2-next v5 0/9] ipaddress: Make print_linkinfo_brief() static
David Ahern wrote: > On 2/15/18 2:23 PM, Serhey Popovych wrote: >> With this series I propose to make print_linkinfo_brief() static in >> favor of print_linkinfo() as single point for linkinfo printing. >> > > ... > >> >> Thanks, >> Serhii >> >> Serhey Popovych (9): >> ipaddress: Abstract IFA_LABEL matching code >> ipaddress: ll_map: Replace ll_idx_n2a() with ll_index_to_name() >> utils: Reimplement ll_idx_n2a() and introduce ll_idx_a2n() >> ipaddress: Improve print_linkinfo() >> ipaddress: Simplify print_linkinfo_brief() and it's usage >> lib: Correct object file dependencies >> utils: Introduce and use get_ifname_rta() >> utils: Introduce and use print_name_and_link() to print name@link >> ipaddress: Make print_linkinfo_brief() static >> >> Makefile |2 +- >> bridge/link.c| 21 ++--- >> include/ll_map.h |4 +- >> include/utils.h |5 ++ >> ip/ip_common.h |2 - >> ip/ipaddress.c | 231 >> ++ >> ip/iplink.c |5 +- >> lib/Makefile |4 +- >> lib/ll_map.c | 31 +--- >> lib/utils.c | 68 >> 10 files changed, 167 insertions(+), 206 deletions(-) >> > > Series applied after the pretty fixup from a separate patch. > > Serhey: please cc me on patches for iproute2-next; I have a separate > email account for netdev mail and at times it gets way behind the netdev > list. > Accepted, will cc when sending next patches. Thank you. signature.asc Description: OpenPGP digital signature
[PATCH][V2] net: dsa: mv88e6xxx: avoid unintended sign extension on a 16 bit shift
From: Colin Ian King. The shifting of timehi by 16 bits to the left will be promoted to a 32 bit signed int and then sign-extended to a u64. If the top bit of timehi is set then all the upper bits of ns end up also being set because of the sign-extension. Fix this by making timehi and timelo u64. Also move the declaration of ns. Detected by CoverityScan, CID#1465288 ("Unintended sign extension") Fixes: c6fe0ad2c349 ("net: dsa: mv88e6xxx: add rx/tx timestamping support") Signed-off-by: Colin Ian King --- drivers/net/dsa/mv88e6xxx/hwtstamp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.c b/drivers/net/dsa/mv88e6xxx/hwtstamp.c index b251d534b70d..2149d332dea0 100644 --- a/drivers/net/dsa/mv88e6xxx/hwtstamp.c +++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.c @@ -293,7 +293,8 @@ static void mv88e6xxx_get_rxts(struct mv88e6xxx_chip *chip, struct sk_buff *skb, u16 reg, struct sk_buff_head *rxq) { - u16 buf[4] = { 0 }, status, timelo, timehi, seq_id; + u16 buf[4] = { 0 }, status, seq_id; + u64 ns, timelo, timehi; struct skb_shared_hwtstamps *shwt; int err; @@ -321,7 +322,7 @@ static void mv88e6xxx_get_rxts(struct mv88e6xxx_chip *chip, */ for ( ; skb; skb = skb_dequeue(rxq)) { if (mv88e6xxx_ts_valid(status) && seq_match(skb, seq_id)) { - u64 ns = timehi << 16 | timelo; + ns = timehi << 16 | timelo; mutex_lock(>reg_lock); ns = timecounter_cyc2time(>tstamp_tc, ns); -- 2.15.1
Re: [PATCH] i40evf: remove redundant array comparisons to 0 checks
On 16/02/18 16:51, Andy Shevchenko wrote: > On Thu, Feb 15, 2018 at 9:42 PM, Colin Kingwrote: >> From: Colin Ian King >> >> The checks to see if key->dst.s6_addr and key->src.s6_addr are null >> pointers are redundant because these are constant size arrays and >> so the checks always return true. Fix this by removing the redundant >> checks. > >> + for (i = 0; i < 4; i++) > >> + filter->f.mask.tcp_spec.dst_ip[i] |= >> >> cpu_to_be32(0x); > > Can it be one line then? I re-adjusted the text because checkpatch was complaining. > >> + memcpy(>f.data.tcp_spec.dst_ip, >> + >dst.s6_addr32, > > Ditto. > >> + sizeof(filter->f.data.tcp_spec.dst_ip)); >> + >> + for (i = 0; i < 4; i++) > >> + filter->f.mask.tcp_spec.src_ip[i] |= >> >> cpu_to_be32(0x); > > Ditto. > >> + memcpy(>f.data.tcp_spec.src_ip, >> + >src.s6_addr32, > > Ditto. > >> + sizeof(filter->f.data.tcp_spec.src_ip)); >
Re: [PATCH RFC 0/4] net: add bpfilter
Hi Florian, thanks for your feedback! More inline: On 02/16/2018 03:57 PM, Florian Westphal wrote: > Daniel Borkmannwrote: >> This is a very rough and early proof of concept that implements bpfilter. > > [..] > >> Also, as a benefit from such design, we get BPF JIT compilation on x86_64, >> arm64, ppc64, sparc64, mips64, s390x and arm32, but also rule offloading >> into HW for free for Netronome NFP SmartNICs that are already capable of >> offloading BPF since we can reuse all existing BPF infrastructure as the >> back end. The user space iptables binary issuing rule addition or dumps was >> left as-is, thus at some point any binaries against iptables uapi kernel >> interface could transparently be supported in such manner in long term. >> >> As rule translation can potentially become very complex, this is performed >> entirely in user space. In order to ease deployment, request_module() code >> is extended to allow user mode helpers to be invoked. Idea is that user mode >> helpers are built as part of the kernel build and installed as traditional >> kernel modules with .ko file extension into distro specified location, >> such that from a distribution point of view, they are no different than >> regular kernel modules. Thus, allow request_module() logic to load such >> user mode helper (umh) binaries via: >> >> request_module("foo") -> >> call_umh("modprobe foo") -> >> sys_finit_module(FD of /lib/modules/.../foo.ko) -> >> call_umh(struct file) >> >> Such approach enables kernel to delegate functionality traditionally done >> by kernel modules into user space processes (either root or !root) and >> reduces security attack surface of such new code, meaning in case of >> potential bugs only the umh would crash but not the kernel. Another >> advantage coming with that would be that bpfilter.ko can be debugged and >> tested out of user space as well (e.g. opening the possibility to run >> all clang sanitizers, fuzzers or test suites for checking translation). 
> > Several questions spinning at the moment, I will probably come up with > more: Sure, no problem at all. It's an early RFC, so purpose is to get a discussion going on such potential approach. > 1. Does this still attach the binary blob to the 'normal' iptables >hooks? Yeah, so thought would be to keep the user land tooling functional as is w/o having to recompile binaries, thus this would also need to attach for the existing hooks in order to keep semantics working. As a benefit in addition we can also reuse all the rest of the infrastructure to utilize things like XDP for iptables in the background, there is definitely flexibility on this side thus users could eventually benefit from this transparently and don't need to know that 'bpfilter' exists and is translating in the background. I realize taking this path is a long term undertake that we would need to tackle as a community, not just one or two individuals when we decide to go for this direction. > 2. If yes, do you see issues wrt. 'iptables' and 'bpfilter' attached > programs being different in nature (e.g. changed by different entities)? There could certainly be multiple options, e.g. a fall-through with state transfer once a request cannot be handled yet or a sysctl with iptables being the default handler and an option to switch to bpfilter for letting it handle requests for that time being. > 3. What happens if the rule can't be translated (yet?) (See above.) > 4. Do you plan to reimplement connection tracking in userspace? One option could be to have a generic, skb-less connection tracker in kernel that can be reused from the various hooks it would need to handle, potentially that would also be able to get offloaded into HW as another benefit coming out from that. > If no, how will the bpf program interact with it? > [ same question applies to ipv6 exthdr traversal, ip defragmentation > and the like ]. 
The v6 exthdr traversal could be realized natively via BPF which should make the parsing more robust at the same time than having it somewhere inside a helper in kernel directly; bounded loops in BPF would help as well on that front, similarly for defrag this could be handled by the prog although here we would need additional infra to queue the packets and then recirculate. > I will probably have a quadrillion of followup questions, sorry :-/ Definitely, please do! Thanks, Daniel >> Also, such architecture makes the kernel/user boundary very precise, >> meaning requests can be handled and BPF translated in control plane part >> in user space with its own user memory etc, while minimal data plane >> bits are in kernel. It would also allow to remove old xtables modules >> at some point from the kernel while keeping functionality in place. > > This is what we tried with nftables :-/
Re: [PATCH] i40evf: remove redundant array comparisons to 0 checks
On Thu, Feb 15, 2018 at 9:42 PM, Colin Kingwrote: > From: Colin Ian King > > The checks to see if key->dst.s6_addr and key->src.s6_addr are null > pointers are redundant because these are constant size arrays and > so the checks always return true. Fix this by removing the redundant > checks. > + for (i = 0; i < 4; i++) > + filter->f.mask.tcp_spec.dst_ip[i] |= > > cpu_to_be32(0x); Can it be one line then? > + memcpy(>f.data.tcp_spec.dst_ip, > + >dst.s6_addr32, Ditto. > + sizeof(filter->f.data.tcp_spec.dst_ip)); > + > + for (i = 0; i < 4; i++) > + filter->f.mask.tcp_spec.src_ip[i] |= > > cpu_to_be32(0x); Ditto. > + memcpy(>f.data.tcp_spec.src_ip, > + >src.s6_addr32, Ditto. > + sizeof(filter->f.data.tcp_spec.src_ip)); -- With Best Regards, Andy Shevchenko
Re: Serious performance degradation in Linux 4.15
On Fri, Feb 16, 2018 at 02:38:39PM +0000, Matt Fleming wrote: > On Wed, 14 Feb, at 10:46:20PM, Matt Fleming wrote: > > Here's some more numbers. This is with RETPOLINE=y but you'll see it > > doesn't make much of a difference. Oh, this is also with powersave > > cpufreq governor. > > Feh, I was wrong. The differences in performance I see are entirely > due to CONFIG_RETPOLINE and CONFIG_PAGE_TABLE_ISOLATION being enabled. OK, so I'm not the one crazy person not being able to reproduce this :-) Let's wait for more details from the original submitter.
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On Fri, Feb 16, 2018 at 11:43 AM, Eric Dumazetwrote: > > On Fri, Feb 16, 2018 at 8:33 AM, Neal Cardwell wrote: > > Oleksandr, > > > > Thanks for the detailed report! Yes, this sounds like an issue in BBR. We > > have not run into this one in our team, but we will try to work with you to > > fix this. > > > > Would you be able to take a sender-side tcpdump trace of the slow BBR > > transfer ("v4.13 + BBR + fq_codel == Not OK")? Packet headers only would be > > fine. Maybe something like: > > > > tcpdump -w /tmp/test.pcap -c100 -s 100 -i eth0 port $PORT > > > > Thanks! > > neal > > On baremetal and using latest net tree, I get pretty normal results at > least, on 40Gbit NIC, Eric raises a good question: bare metal vs VMs. Oleksandr, your first email mentioned KVM VMs and virtio NICs. Your second e-mail did not seem to mention if those results were for bare metal or a VM scenario: can you please clarify the details on your second set of tests? Thanks!
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On Fri, Feb 16, 2018 at 8:33 AM, Neal Cardwellwrote: > Oleksandr, > > Thanks for the detailed report! Yes, this sounds like an issue in BBR. We > have not run into this one in our team, but we will try to work with you to > fix this. > > Would you be able to take a sender-side tcpdump trace of the slow BBR > transfer ("v4.13 + BBR + fq_codel == Not OK")? Packet headers only would be > fine. Maybe something like: > > tcpdump -w /tmp/test.pcap -c100 -s 100 -i eth0 port $PORT > > Thanks! > neal On baremetal and using latest net tree, I get pretty normal results at least, on 40Gbit NIC, with pfifo_fast, fq and fq_codel. # tc qd replace dev eth0 root fq # ./super_netperf 1 -H lpaa24 -- -K cubic 25627 # ./super_netperf 1 -H lpaa24 -- -K bbr 25897 # tc qd replace dev eth0 root fq_codel # ./super_netperf 1 -H lpaa24 -- -K cubic 22246 # ./super_netperf 1 -H lpaa24 -- -K bbr 25228 # tc qd replace dev eth0 root pfifo_fast # ./super_netperf 1 -H lpaa24 -- -K cubic 25454 # ./super_netperf 1 -H lpaa24 -- -K bbr 25508
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On 02/16/18 16:15, Oleksandr Natalenko wrote: > Hi, David, Eric, Neal et al. > > On čtvrtek 15. února 2018 21:42:26 CET Oleksandr Natalenko wrote: >> I've faced an issue with a limited TCP bandwidth between my laptop and a >> server in my 1 Gbps LAN while using BBR as a congestion control mechanism. >> To verify my observations, I've set up 2 KVM VMs with the following >> parameters: >> >> 1) Linux v4.15.3 >> 2) virtio NICs >> 3) 128 MiB of RAM >> 4) 2 vCPUs >> 5) tested on both non-PREEMPT/100 Hz and PREEMPT/1000 Hz These are very odd configurations. :) Non-preempt/100 might well be too slow, whereas PREEMPT/1000 might simply have too much overhead. >> The VMs are interconnected via host bridge (-netdev bridge). I was running >> iperf3 in the default and reverse mode. Here are the results: >> >> 1) BBR on both VMs >> >> upload: 3.42 Gbits/sec, cwnd ~ 320 KBytes >> download: 3.39 Gbits/sec, cwnd ~ 320 KBytes >> >> 2) Reno on both VMs >> >> upload: 5.50 Gbits/sec, cwnd = 976 KBytes (constant) >> download: 5.22 Gbits/sec, cwnd = 1.20 MBytes (constant) >> >> 3) Reno on client, BBR on server >> >> upload: 5.29 Gbits/sec, cwnd = 952 KBytes (constant) >> download: 3.45 Gbits/sec, cwnd ~ 320 KBytes >> >> 4) BBR on client, Reno on server >> >> upload: 3.36 Gbits/sec, cwnd ~ 370 KBytes >> download: 5.21 Gbits/sec, cwnd = 887 KBytes (constant) >> >> So, as you may see, when BBR is in use, upload rate is bad and cwnd is low. BBR in general will run with lower cwnd than e.g. Cubic or others. That's a feature and necessary for WAN transfers. >> If using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput >> to ~100 Mbps (verifiable not only by iperf3, but also by scp while >> transferring some files between hosts). Something seems really wrong with your setup. 
I get completely expected throughput on wired 1Gb between two hosts: Connecting to host tux, port 5201 [ 5] local 192.168.100.223 port 48718 connected to 192.168.100.222 port 5201 [ ID] Interval Transfer Bitrate Retr Cwnd [ 5] 0.00-1.00 sec 113 MBytes 948 Mbits/sec0204 KBytes [ 5] 1.00-2.00 sec 112 MBytes 941 Mbits/sec0204 KBytes [ 5] 2.00-3.00 sec 112 MBytes 941 Mbits/sec0204 KBytes [...] Running it locally gives the more or less expected results as well: Connecting to host ragnarok, port 5201 [ 5] local 192.168.100.223 port 54090 connected to 192.168.100.223 port 5201 [ ID] Interval Transfer Bitrate Retr Cwnd [ 5] 0.00-1.00 sec 8.09 GBytes 69.5 Gbits/sec0512 KBytes [ 5] 1.00-2.00 sec 8.14 GBytes 69.9 Gbits/sec0512 KBytes [ 5] 2.00-3.00 sec 8.43 GBytes 72.4 Gbits/sec0512 KBytes [...] Both hosts running 4.14.x with bbr and fq_codel (default qdisc everywhere). In the past I only used BBR briefly for testing since at 1Gb speeds on my LAN it was actually slightly slower than Cubic (some of those bugs were recently addressed) and made no difference otherwise, even for uploads - which are capped by my 50/10 DSL anyway. Please note that BBR was developed to address the case of WAN transfers (or more precisely high BDP paths) which often suffer from TCP throughput collapse due to single packet loss events. While it might "work" in other scenarios as well, strictly speaking delay-based anything is increasingly less likely to work when there is no meaningful notion of delay - such as on a LAN. (yes, this is very simplified..) The BBR mailing list has several nice reports why the current BBR implementation (dubbed v1) has a few - sometimes severe - problems. These are being addressed as we speak. (let me know if you want some of those tech reports by email. :) >> Also, I've tried to use YeAH instead of Reno, and it gives me the same >> results as Reno (IOW, YeAH works fine too). >> >> Questions: >> >> 1) is this expected? >> 2) or am I missing some extra BBR tuneable? 
No, it should work out of the box. >> 3) if it is not a regression (I don't have any previous data to compare >> with), how can I fix this? >> 4) if it is a bug in BBR, what else should I provide or check for a proper >> investigation? > > I've played with BBR a little bit more and managed to narrow the issue down > to > the changes between v4.12 and v4.13. Here are my observations: > > v4.12 + BBR + fq_codel == OK > v4.12 + BBR + fq == OK > v4.13 + BBR + fq_codel == Not OK > v4.13 + BBR + fq == OK > > I think this has something to do with an internal TCP implementation for > pacing, that was introduced in v4.13 (commit 218af599fa63) specifically to > allow using BBR together with non-fq qdiscs. Once BBR relies on fq, the > throughput is high and saturates the link, but if another qdisc is in use, > for > instance, fq_codel, the throughput drops. Just to be sure, I've also tried > pfifo_fast instead of fq_codel with the same outcome resulting in the low > throughput. I'm
Re: TCP and BBR: reproducibly low cwnd and bandwidth
On Fri, Feb 16, 2018 at 7:15 AM, Oleksandr Natalenko wrote: > Hi, David, Eric, Neal et al. > > On čtvrtek 15. února 2018 21:42:26 CET Oleksandr Natalenko wrote: >> I've faced an issue with a limited TCP bandwidth between my laptop and a >> server in my 1 Gbps LAN while using BBR as a congestion control mechanism. >> To verify my observations, I've set up 2 KVM VMs with the following >> parameters: >> >> 1) Linux v4.15.3 >> 2) virtio NICs >> 3) 128 MiB of RAM >> 4) 2 vCPUs >> 5) tested on both non-PREEMPT/100 Hz and PREEMPT/1000 Hz >> >> The VMs are interconnected via host bridge (-netdev bridge). I was running >> iperf3 in the default and reverse mode. Here are the results: >> >> 1) BBR on both VMs >> >> upload: 3.42 Gbits/sec, cwnd ~ 320 KBytes >> download: 3.39 Gbits/sec, cwnd ~ 320 KBytes >> >> 2) Reno on both VMs >> >> upload: 5.50 Gbits/sec, cwnd = 976 KBytes (constant) >> download: 5.22 Gbits/sec, cwnd = 1.20 MBytes (constant) >> >> 3) Reno on client, BBR on server >> >> upload: 5.29 Gbits/sec, cwnd = 952 KBytes (constant) >> download: 3.45 Gbits/sec, cwnd ~ 320 KBytes >> >> 4) BBR on client, Reno on server >> >> upload: 3.36 Gbits/sec, cwnd ~ 370 KBytes >> download: 5.21 Gbits/sec, cwnd = 887 KBytes (constant) >> >> So, as you may see, when BBR is in use, upload rate is bad and cwnd is low. >> If using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput >> to ~100 Mbps (verifiable not only by iperf3, but also by scp while >> transferring some files between hosts). >> >> Also, I've tried to use YeAH instead of Reno, and it gives me the same >> results as Reno (IOW, YeAH works fine too). >> >> Questions: >> >> 1) is this expected? >> 2) or am I missing some extra BBR tuneable? >> 3) if it is not a regression (I don't have any previous data to compare >> with), how can I fix this? >> 4) if it is a bug in BBR, what else should I provide or check for a proper >> investigation? 
> > I've played with BBR a little bit more and managed to narrow the issue down to > the changes between v4.12 and v4.13. Here are my observations: > > v4.12 + BBR + fq_codel == OK > v4.12 + BBR + fq == OK > v4.13 + BBR + fq_codel == Not OK > v4.13 + BBR + fq == OK > > I think this has something to do with an internal TCP implementation for > pacing, that was introduced in v4.13 (commit 218af599fa63) specifically to > allow using BBR together with non-fq qdiscs. Once BBR relies on fq, the > throughput is high and saturates the link, but if another qdisc is in use, for > instance, fq_codel, the throughput drops. Just to be sure, I've also tried > pfifo_fast instead of fq_codel with the same outcome resulting in the low > throughput. > > Unfortunately, I do not know if this is something expected or should be > considered as a regression. Thus, asking for an advice. > > Ideas? The way TCP pacing works, it defaults to internal pacing using a hint stored in the socket. If you change the qdisc while flow is alive, result could be unexpected. (TCP socket remembers that one FQ was supposed to handle the pacing) What results do you have if you use standard pfifo_fast ? I am asking because TCP pacing relies on High resolution timers, and that might be weak on your VM.
Re: TCP and BBR: reproducibly low cwnd and bandwidth
Lets CC BBR folks at Google, and remove the ones that probably have no idea. On Thu, 2018-02-15 at 21:42 +0100, Oleksandr Natalenko wrote: > Hello. > > I've faced an issue with a limited TCP bandwidth between my laptop and a > server in my 1 Gbps LAN while using BBR as a congestion control mechanism. To > verify my observations, I've set up 2 KVM VMs with the following parameters: > > 1) Linux v4.15.3 > 2) virtio NICs > 3) 128 MiB of RAM > 4) 2 vCPUs > 5) tested on both non-PREEMPT/100 Hz and PREEMPT/1000 Hz > > The VMs are interconnected via host bridge (-netdev bridge). I was running > iperf3 in the default and reverse mode. Here are the results: > > 1) BBR on both VMs > > upload: 3.42 Gbits/sec, cwnd ~ 320 KBytes > download: 3.39 Gbits/sec, cwnd ~ 320 KBytes > > 2) Reno on both VMs > > upload: 5.50 Gbits/sec, cwnd = 976 KBytes (constant) > download: 5.22 Gbits/sec, cwnd = 1.20 MBytes (constant) > > 3) Reno on client, BBR on server > > upload: 5.29 Gbits/sec, cwnd = 952 KBytes (constant) > download: 3.45 Gbits/sec, cwnd ~ 320 KBytes > > 4) BBR on client, Reno on server > > upload: 3.36 Gbits/sec, cwnd ~ 370 KBytes > download: 5.21 Gbits/sec, cwnd = 887 KBytes (constant) > > So, as you may see, when BBR is in use, upload rate is bad and cwnd is low. > If > using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput to > ~100 Mbps (verifiable not only by iperf3, but also by scp while transferring > some files between hosts). > > Also, I've tried to use YeAH instead of Reno, and it gives me the same > results > as Reno (IOW, YeAH works fine too). > > Questions: > > 1) is this expected? > 2) or am I missing some extra BBR tuneable? > 3) if it is not a regression (I don't have any previous data to compare > with), > how can I fix this? > 4) if it is a bug in BBR, what else should I provide or check for a proper > investigation? > > Thanks. > > Regards, > Oleksandr > >
Re: [PATCH iproute2-next v5 0/9] ipaddress: Make print_linkinfo_brief() static
On 2/15/18 2:23 PM, Serhey Popovych wrote: > With this series I propose to make print_linkinfo_brief() static in > favor of print_linkinfo() as single point for linkinfo printing. > ... > > Thanks, > Serhii > > Serhey Popovych (9): > ipaddress: Abstract IFA_LABEL matching code > ipaddress: ll_map: Replace ll_idx_n2a() with ll_index_to_name() > utils: Reimplement ll_idx_n2a() and introduce ll_idx_a2n() > ipaddress: Improve print_linkinfo() > ipaddress: Simplify print_linkinfo_brief() and it's usage > lib: Correct object file dependencies > utils: Introduce and use get_ifname_rta() > utils: Introduce and use print_name_and_link() to print name@link > ipaddress: Make print_linkinfo_brief() static > > Makefile |2 +- > bridge/link.c| 21 ++--- > include/ll_map.h |4 +- > include/utils.h |5 ++ > ip/ip_common.h |2 - > ip/ipaddress.c | 231 > ++ > ip/iplink.c |5 +- > lib/Makefile |4 +- > lib/ll_map.c | 31 +--- > lib/utils.c | 68 > 10 files changed, 167 insertions(+), 206 deletions(-) > Series applied after the pretty fixup from a separate patch. Serhey: please cc me on patches for iproute2-next; I have a separate email account for netdev mail and at times it gets way behind the netdev list.
Re: [PATCH RFC 0/4] net: add bpfilter
Florian Westphal wrote: > Daniel Borkmann wrote: > Several questions spinning at the moment, I will probably come up with > more: ... and here there are some more ... One of the many pain points of xtables design is the assumption of 'used only by sysadmin'. This has not been true for a very long time, so by now iptables has this userspace lock (yes, its fugly workaround) to serialize concurrent iptables invocations in userspace. AFAIU the translate-in-userspace design now brings back the old problem of different tools overwriting each other's iptables rules. Another question -- am I correct in that each rule manipulation would incur a 'recompilation'? Or are there different mini programs chained together? One of the nftables advantages (since rule representation in kernel is black-box from userspace point of view) is that the kernel can announce add/delete of rules or elements from nftables sets. Any particular reason why translating iptables rather than nftables (it should be possible to monitor the nftables changes that are announced by kernel and act on those)?
[PATCH v2] ravb: add support for changing MTU
Allow for changing the MTU within the limit of the maximum size of a descriptor (2048 bytes). Add the callback to change MTU from user-space and take the configurable MTU into account when configuring the hardware. Signed-off-by: Niklas Söderlund--- drivers/net/ethernet/renesas/ravb.h | 1 + drivers/net/ethernet/renesas/ravb_main.c | 34 +--- 2 files changed, 28 insertions(+), 7 deletions(-) * Changes since v1 - Fix spelling error. - s/le16_to_cpu(rx_desc->ds_cc)/priv->rx_buf_sz/. - Drop ETH_FCS_LEN + 16 from rx_buf_sz calculation as pointed out by Sergei. diff --git a/drivers/net/ethernet/renesas/ravb.h b/drivers/net/ethernet/renesas/ravb.h index 96a27b00c90e212a..b81f4faf7b10114d 100644 --- a/drivers/net/ethernet/renesas/ravb.h +++ b/drivers/net/ethernet/renesas/ravb.h @@ -1018,6 +1018,7 @@ struct ravb_private { u32 dirty_rx[NUM_RX_QUEUE]; /* Producer ring indices */ u32 cur_tx[NUM_TX_QUEUE]; u32 dirty_tx[NUM_TX_QUEUE]; + u32 rx_buf_sz; /* Based on MTU+slack. */ struct napi_struct napi[NUM_RX_QUEUE]; struct work_struct work; /* MII transceiver section. 
*/ diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index c87f57ca44371586..34e841306e04a3d3 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -238,7 +238,7 @@ static void ravb_ring_free(struct net_device *ndev, int q) le32_to_cpu(desc->dptr))) dma_unmap_single(ndev->dev.parent, le32_to_cpu(desc->dptr), -PKT_BUF_SZ, +priv->rx_buf_sz, DMA_FROM_DEVICE); } ring_size = sizeof(struct ravb_ex_rx_desc) * @@ -300,9 +300,9 @@ static void ravb_ring_format(struct net_device *ndev, int q) for (i = 0; i < priv->num_rx_ring[q]; i++) { /* RX descriptor */ rx_desc = >rx_ring[q][i]; - rx_desc->ds_cc = cpu_to_le16(PKT_BUF_SZ); + rx_desc->ds_cc = cpu_to_le16(priv->rx_buf_sz); dma_addr = dma_map_single(ndev->dev.parent, priv->rx_skb[q][i]->data, - PKT_BUF_SZ, + priv->rx_buf_sz, DMA_FROM_DEVICE); /* We just set the data size to 0 for a failed mapping which * should prevent DMA from happening... @@ -346,6 +346,10 @@ static int ravb_ring_init(struct net_device *ndev, int q) int ring_size; int i; + /* +16 gets room from the status from the card. */ + priv->rx_buf_sz = (ndev->mtu <= 1492 ? PKT_BUF_SZ : ndev->mtu) + + ETH_HLEN + VLAN_HLEN; + /* Allocate RX and TX skb rings */ priv->rx_skb[q] = kcalloc(priv->num_rx_ring[q], sizeof(*priv->rx_skb[q]), GFP_KERNEL); @@ -355,7 +359,7 @@ static int ravb_ring_init(struct net_device *ndev, int q) goto error; for (i = 0; i < priv->num_rx_ring[q]; i++) { - skb = netdev_alloc_skb(ndev, PKT_BUF_SZ + RAVB_ALIGN - 1); + skb = netdev_alloc_skb(ndev, priv->rx_buf_sz + RAVB_ALIGN - 1); if (!skb) goto error; ravb_set_buffer_align(skb); @@ -586,7 +590,7 @@ static bool ravb_rx(struct net_device *ndev, int *quota, int q) skb = priv->rx_skb[q][entry]; priv->rx_skb[q][entry] = NULL; dma_unmap_single(ndev->dev.parent, le32_to_cpu(desc->dptr), -PKT_BUF_SZ, +priv->rx_buf_sz, DMA_FROM_DEVICE); get_ts &= (q == RAVB_NC) ? 
RAVB_RXTSTAMP_TYPE_V2_L2_EVENT : @@ -619,11 +623,12 @@ static bool ravb_rx(struct net_device *ndev, int *quota, int q) for (; priv->cur_rx[q] - priv->dirty_rx[q] > 0; priv->dirty_rx[q]++) { entry = priv->dirty_rx[q] % priv->num_rx_ring[q]; desc = >rx_ring[q][entry]; - desc->ds_cc = cpu_to_le16(PKT_BUF_SZ); + desc->ds_cc = cpu_to_le16(priv->rx_buf_sz); if (!priv->rx_skb[q][entry]) { skb = netdev_alloc_skb(ndev, - PKT_BUF_SZ + RAVB_ALIGN - 1); + priv->rx_buf_sz + + RAVB_ALIGN - 1); if (!skb) break; /*
Re: [PATCH v2] net: dsa: mv88e6xxx: hwtstamp: fix potential negative array index read
On Fri, Feb 16, 2018 at 07:48:46AM -0800, Richard Cochran wrote: > On Thu, Feb 15, 2018 at 12:31:39PM -0600, Gustavo A. R. Silva wrote: > > _port_ is being used as index to array port_hwtstamp before verifying > > it is a non-negative number and a valid index at line 209 and 258: > > > > if (port < 0 || port >= mv88e6xxx_num_ports(chip)) > > > > Fix this by checking _port_ before using it as index to array > > port_hwtstamp. > > NAK. Port is already known to be valid in the callers. And so the real bug is the pointless range checking tests. I would welcome patches to remove those. Thanks, Richard
Re: [PATCH v2] net: dsa: mv88e6xxx: hwtstamp: fix potential negative array index read
On Fri, Feb 16, 2018 at 07:48:46AM -0800, Richard Cochran wrote: > On Thu, Feb 15, 2018 at 12:31:39PM -0600, Gustavo A. R. Silva wrote: > > _port_ is being used as index to array port_hwtstamp before verifying > > it is a non-negative number and a valid index at line 209 and 258: > > > > if (port < 0 || port >= mv88e6xxx_num_ports(chip)) > > > > Fix this by checking _port_ before using it as index to array > > port_hwtstamp. > > NAK. Port is already known to be valid in the callers. Then we should take out the check. It is probably this check which is causing the false positives. Andrew
Re: [PATCH][next] net: dsa: mv88e6xxx: avoid unintended sign extension on a 16 bit shift
On Thu, Feb 15, 2018 at 09:27:57PM +0100, Andrew Lunn wrote: > Do you prefer this, or making timehi and timelo a u64? The latter. While you are at it, please move the definition of 'ns' to the start of the function. Thanks, Richard
Re: [RFC][PATCH bpf v2 1/2] bpf: allow 64-bit offsets for bpf function calls
Daniel Borkmann wrote: On 02/15/2018 05:25 PM, Daniel Borkmann wrote: On 02/13/2018 05:05 AM, Sandipan Das wrote: The imm field of a bpf_insn is a signed 32-bit integer. For JIT-ed bpf-to-bpf function calls, it stores the offset from __bpf_call_base to the start of the callee function. For some architectures, such as powerpc64, it was found that this offset may be as large as 64 bits because of which this cannot be accomodated in the imm field without truncation. To resolve this, we additionally make aux->func within each bpf_prog associated with the functions to point to the list of all function addresses determined by the verifier. We keep the value assigned to the off field of the bpf_insn as a way to index into aux->func and also set aux->func_cnt so that this can be used for performing basic upper bound checks for the off field. Signed-off-by: Sandipan Das--- v2: Make aux->func point to the list of functions determined by the verifier rather than allocating a separate callee list for each function. Approach looks good to me; do you know whether s390x JIT would have similar requirement? I think one limitation that would still need to be addressed later with such approach would be regarding the xlated prog dump in bpftool, see 'BPF calls via JIT' in 7105e828c087 ("bpf: allow for correlation of maps and helpers in dump"). Any ideas for this (potentially if we could use off + imm for calls, we'd get to 48 bits, but that seems still not be enough as you say)? All good points. I'm not really sure how s390x works, so I can't comment on that, but I'm copying Michael Holzheu for his consideration. With the existing scheme, 48 bits won't be enough, so we rejected that approach. I can also see how this will be a problem with bpftool, but I haven't looked into it in detail. I wonder if we can annotate the output to indicate the function being referred to? One other random thought, although I'm not sure how feasible this is for ppc64 JIT to realize ... 
but idea would be to have something like the below: diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 29ca920..daa7258 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -512,6 +512,11 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +void * __weak bpf_jit_image_alloc(unsigned long size) +{ + return module_alloc(size); +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -525,7 +530,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); - hdr = module_alloc(size); + hdr = bpf_jit_image_alloc(size); if (hdr == NULL) return NULL; And ppc64 JIT could override bpf_jit_image_alloc() in a similar way like some archs would override the module_alloc() helper through a custom implementation, usually via __vmalloc_node_range(), so we could perhaps fit the range for BPF JITed images in a way that they could use the 32bit imm in the end? There are not that many progs loaded typically, so the range could be a bit narrower in such case anyway. (Not sure if this would work out though, but I thought to bring it up.) That'd be a good option to consider. I don't think we want to allocate anything from the linear memory range since users could load unprivileged BPF programs and consume a lot of memory that way. I doubt if we can map vmalloc'ed memory into the 0xc0 address range, but I'm not entirely sure. Michael, Is the above possible? The question is if we can have BPF programs be allocated within 4GB of __bpf_call_base (which is a kernel symbol), so that calls to those programs can be encoded in a 32-bit immediate field in a BPF instruction. As an extension, we may be able to extend it to 48-bits by combining with another BPF instruction field (offset). In either case, the vmalloc'ed address range won't work. 
The alternative is to pass the full 64-bit address of the BPF program in an auxiliary field (as proposed in this patch set) but we need to fix it up for 'bpftool' as well. Thanks, Naveen
Re: [PATCH v2] net: dsa: mv88e6xxx: hwtstamp: fix potential negative array index read
On Thu, Feb 15, 2018 at 12:31:39PM -0600, Gustavo A. R. Silva wrote: > _port_ is being used as index to array port_hwtstamp before verifying > it is a non-negative number and a valid index at line 209 and 258: > > if (port < 0 || port >= mv88e6xxx_num_ports(chip)) > > Fix this by checking _port_ before using it as index to array > port_hwtstamp. NAK. Port is already known to be valid in the callers. See: *** net/dsa/slave.c: dsa_slave_ioctl[266] *** net/dsa/slave.c: dsa_skb_tx_timestamp[416] *** net/dsa/dsa.c:dsa_skb_defer_rx_timestamp[152] > Addresses-Coverity-ID: 1465287 ("Negative array index read") > Addresses-Coverity-ID: 1465291 ("Negative array index read") Please check the code before posting. These false positives are really annoying. Thanks, Richard
Re: [PATCH] inet: don't call skb_orphan if tproxy happens in layer 2
Hi Florian & Pablo, Thank you very much for your quick feedback. On 02/16/2018 12:28 PM, Pablo Neira Ayuso wrote: On Fri, Feb 16, 2018 at 12:07:06PM +0100, Florian Westphal wrote: Gregory Vander Schueren wrote: [ cc netdev ] If sysctl bridge-nf-call-iptables is enabled, iptables chains are already traversed from the bridging code. In such case, tproxy already happened when reaching ip_rcv. Thus no need to call skb_orphan as this would actually undo tproxy. I don't like this because it adds yet another test in fastpath, and for a use case that has apparently never worked before. Agreed. I also thought this was not ideal but I did find another way to easily fix this. We noticed issues when using tproxy with net.bridge.bridge-nf-call-iptables enabled. In such case, ip_rcv() basically undoes tproxy's job. The following patch proposes a fix. I question whether it's a good idea to mix tproxy with bridges. Tproxy relies on policy routing, but a bridge doesn't route :-) I guess you use bridge snat mac mangling to force local delivery of packets that are otherwise bridged? Indeed, we use DNAT MAC mangling. If yes, can you use ebtables brouting instead? This would bypass the bridge (so no iptables invocation from bridge prerouting anymore). We were actually pondering over the usage of MAC DNAT vs brouting. I'll thus follow your suggestion and use brouting instead then. We will try to get rid of nf-call-iptables eventually. Good to know! There might be (more complicated) ways to avoid this problem without adding code in normal network path, but let's check other options first. Agreed. If there's a fix for this, that should be away from the fast path, not in ip_rcv(). -- -- DISCLAIMER. This email and any files transmitted with it are confidential and intended solely for the use of the individual or entity to whom they are addressed. If you have received this email in error please notify the system manager. 
This message contains confidential information and is intended only for the individual named. If you are not the named addressee you should not disseminate, distribute or copy this e-mail. Please notify the sender immediately by e-mail if you have received this e-mail by mistake and delete this e-mail from your system. If you are not the intended recipient you are notified that disclosing, copying, distributing or taking any action in reliance on the contents of this information is strictly prohibited.
Re: [RFC net PATCH] virtio_net: disable XDP_REDIRECT in receive_mergeable() case
On Fri, 16 Feb 2018 13:31:37 +0800 Jason Wangwrote: > On 2018年02月16日 06:43, Jesper Dangaard Brouer wrote: > > The virtio_net code have three different RX code-paths in receive_buf(). > > Two of these code paths can handle XDP, but one of them is broken for > > at least XDP_REDIRECT. > > > > Function(1): receive_big() does not support XDP. > > Function(2): receive_small() support XDP fully and uses build_skb(). > > Function(3): receive_mergeable() broken XDP_REDIRECT uses napi_alloc_skb(). > > > > The simple explanation is that receive_mergeable() is broken because > > it uses napi_alloc_skb(), which violates XDP given XDP assumes packet > > header+data in single page and enough tail room for skb_shared_info. > > > > The longer explaination is that receive_mergeable() tries to > > work-around and satisfy these XDP requiresments e.g. by having a > > function xdp_linearize_page() that allocates and memcpy RX buffers > > around (in case packet is scattered across multiple rx buffers). This > > does currently satisfy XDP_PASS, XDP_DROP and XDP_TX (but only because > > we have not implemented bpf_xdp_adjust_tail yet). > > > > The XDP_REDIRECT action combined with cpumap is broken, and cause hard > > to debug crashes. The main issue is that the RX packet does not have > > the needed tail-room (SKB_DATA_ALIGN(skb_shared_info)), causing > > skb_shared_info to overlap the next packets head-room (in which cpumap > > stores info). > > > > Reproducing depend on the packet payload length and if RX-buffer size > > happened to have tail-room for skb_shared_info or not. But to make > > this even harder to troubleshoot, the RX-buffer size is runtime > > dynamically change based on an Exponentially Weighted Moving Average > > (EWMA) over the packet length, when refilling RX rings. > > > > This patch only disable XDP_REDIRECT support in receive_mergeable() > > case, because it can cause a real crash. 
> > > > But IMHO we should NOT support XDP in receive_mergeable() at all, > > because the principles behind XDP are to gain speed by (1) code > > simplicity, (2) sacrificing memory and (3) where possible moving > > runtime checks to setup time. These principles are clearly being > > violated in receive_mergeable(), that e.g. runtime track average > > buffer size to save memory consumption. > > I agree to disable it for -net now. Okay... I'll send an official patch later. > For net-next, we probably can do: > > - drop xdp_linearize_page() and do XDP through generic XDP helper > after skb was built I disagree strongly here - it makes no sense. Why do you want to explicit fallback to Generic-XDP? (... then all the performance gain is gone!) And besides, a couple of function calls later, the generic XDP code will/can get invoked anyhow... Take a step back: What is the reason/use-case for implementing XDP inside virtio_net? >From a DDoS/performance perspective XDP in virtio_net happens on the "wrong-side" as it is activated _inside_ the guest OS, which is too late for a DDoS filter, as the guest kick/switch overhead have already occurred. I do use XDP_DROP inside the guest (driver virtio_net), but just to perform what I can zoom-in benchmarking, for perf-record isolating the early RX code path in the guest. (Using iptables "raw" table drop is almost as useful for that purpose). The XDP ndo_xdp_xmit in tuntap/tun.c (that you also implemented) is significantly more interesting. As it allow us to skip large parts of the network stack and redirect from a physical device (ixgbe) into a guest device. Ran a benchmark: - 0.5 Mpps with normal code path into device with driver tun - 3.7 Mpps with XDP_REDIRECT from ixgbe into same device Plus, there are indications that 3.7Mpps is not the real limit, as guest CPU doing XDP_DROP is 75% idle... thus this is a likely a scheduling + queue size issue. > - disable EWMA when XDP is set and reserve enough tailroom. 
> > > > > Besides the described bug: > > > > Update(1): There is also a OOM leak in the XDP_REDIRECT code, which > > receive_small() is likely also affected by. > > > > Update(2): Also observed a guest crash when redirecting out an > > another virtio_net device, when device is down. > > Will have a look at these issues. (Holiday in china now, so will do it > after). -- Best regards, Jesper Dangaard Brouer MSc.CS, Principal Kernel Engineer at Red Hat LinkedIn: http://www.linkedin.com/in/brouer
Re: TCP and BBR: reproducibly low cwnd and bandwidth
Hi, David, Eric, Neal et al. On čtvrtek 15. února 2018 21:42:26 CET Oleksandr Natalenko wrote: > I've faced an issue with a limited TCP bandwidth between my laptop and a > server in my 1 Gbps LAN while using BBR as a congestion control mechanism. > To verify my observations, I've set up 2 KVM VMs with the following > parameters: > > 1) Linux v4.15.3 > 2) virtio NICs > 3) 128 MiB of RAM > 4) 2 vCPUs > 5) tested on both non-PREEMPT/100 Hz and PREEMPT/1000 Hz > > The VMs are interconnected via host bridge (-netdev bridge). I was running > iperf3 in the default and reverse mode. Here are the results: > > 1) BBR on both VMs > > upload: 3.42 Gbits/sec, cwnd ~ 320 KBytes > download: 3.39 Gbits/sec, cwnd ~ 320 KBytes > > 2) Reno on both VMs > > upload: 5.50 Gbits/sec, cwnd = 976 KBytes (constant) > download: 5.22 Gbits/sec, cwnd = 1.20 MBytes (constant) > > 3) Reno on client, BBR on server > > upload: 5.29 Gbits/sec, cwnd = 952 KBytes (constant) > download: 3.45 Gbits/sec, cwnd ~ 320 KBytes > > 4) BBR on client, Reno on server > > upload: 3.36 Gbits/sec, cwnd ~ 370 KBytes > download: 5.21 Gbits/sec, cwnd = 887 KBytes (constant) > > So, as you may see, when BBR is in use, upload rate is bad and cwnd is low. > If using real HW (1 Gbps LAN, laptop and server), BBR limits the throughput > to ~100 Mbps (verifiable not only by iperf3, but also by scp while > transferring some files between hosts). > > Also, I've tried to use YeAH instead of Reno, and it gives me the same > results as Reno (IOW, YeAH works fine too). > > Questions: > > 1) is this expected? > 2) or am I missing some extra BBR tuneable? > 3) if it is not a regression (I don't have any previous data to compare > with), how can I fix this? > 4) if it is a bug in BBR, what else should I provide or check for a proper > investigation? I've played with BBR a little bit more and managed to narrow the issue down to the changes between v4.12 and v4.13. 
Here are my observations: v4.12 + BBR + fq_codel == OK v4.12 + BBR + fq == OK v4.13 + BBR + fq_codel == Not OK v4.13 + BBR + fq == OK I think this has something to do with an internal TCP implementation for pacing, that was introduced in v4.13 (commit 218af599fa63) specifically to allow using BBR together with non-fq qdiscs. Once BBR relies on fq, the throughput is high and saturates the link, but if another qdisc is in use, for instance, fq_codel, the throughput drops. Just to be sure, I've also tried pfifo_fast instead of fq_codel with the same outcome resulting in the low throughput. Unfortunately, I do not know if this is something expected or should be considered as a regression. Thus, asking for an advice. Ideas? Thanks. Regards, Oleksandr