[RFC PATCH bpf-next v2 4/4] error-injection: Support fault injection framework
Support in-kernel fault-injection framework via debugfs. This allows you to inject a conditional error to specified function using debugfs interfaces. Signed-off-by: Masami Hiramatsu--- Documentation/fault-injection/fault-injection.txt |5 + kernel/Makefile |1 kernel/fail_function.c| 169 + lib/Kconfig.debug | 10 + 4 files changed, 185 insertions(+) create mode 100644 kernel/fail_function.c diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 918972babcd8..6243a588dd71 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -30,6 +30,11 @@ o fail_mmc_request injects MMC data errors on devices permitted by setting debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request +o fail_function + + injects error return on specific functions by setting debugfs entries + under /sys/kernel/debug/fail_function. No boot option supported. + Configure fault-injection capabilities behavior --- diff --git a/kernel/Makefile b/kernel/Makefile index 172d151d429c..f85ae5dfa474 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o diff --git a/kernel/fail_function.c b/kernel/fail_function.c new file mode 100644 index ..203d3582487a --- /dev/null +++ b/kernel/fail_function.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fail_function.c: Function-based error injection + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); + +static DEFINE_MUTEX(fei_lock); +static struct { + struct kprobe kp; + unsigned long 
retval; + struct fault_attr attr; +} fei_attr = { + .kp = { .pre_handler = fei_kprobe_handler, }, + .retval = ~0UL, /* This indicates -1 in long/int return value */ + .attr = FAULT_ATTR_INITIALIZER, +}; + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) +{ + if (should_fail(_attr.attr, 1)) { + regs_set_return_value(regs, fei_attr.retval); + override_function_with_return(regs); + /* Kprobe specific fixup */ + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + + return 0; +} +NOKPROBE_SYMBOL(fei_kprobe_handler) + +static void *fei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(_lock); + return *pos == 0 ? (void *)1 : NULL; +} + +static void fei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(_lock); +} + +static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return NULL; +} + +static int fei_seq_show(struct seq_file *m, void *v) +{ + if (fei_attr.kp.addr) + seq_printf(m, "%pf\n", fei_attr.kp.addr); + else + seq_puts(m, "# not specified\n"); + return 0; +} + +static const struct seq_operations fei_seq_ops = { + .start = fei_seq_start, + .next = fei_seq_next, + .stop = fei_seq_stop, + .show = fei_seq_show, +}; + +static int fei_open(struct inode *inode, struct file *file) +{ + return seq_open(file, _seq_ops); +} + +static ssize_t fei_write(struct file *file, const char __user *buffer, +size_t count, loff_t *ppos) +{ + unsigned long addr; + char *buf, *sym; + int ret; + + /* cut off if it is too long */ + if (count > KSYM_NAME_LEN) + count = KSYM_NAME_LEN; + buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, buffer, count)) { + ret = -EFAULT; + goto out; + } + buf[count] = '\0'; + sym = strstrip(buf); + + if (strlen(sym) == 0 || sym[0] == '0') { + if (fei_attr.kp.addr) { + unregister_kprobe(_attr.kp); + fei_attr.kp.addr = NULL; + } + ret = count; + goto out; + } + + addr = kallsyms_lookup_name(sym); + if (!addr) { + ret = 
-EINVAL; + goto out; + } + if (!within_error_injection_list(addr)) { + ret = -ERANGE; + goto out; + } + + if (fei_attr.kp.addr) { +
[RFC PATCH bpf-next v2 3/4] error-injection: Separate error-injection from kprobe
Since error-injection framework is not limited to be used by kprobes, nor bpf. Other kernel subsystems can use it freely for checking safeness of error-injection, e.g. livepatch, ftrace etc. So this separate error-injection framework from kprobes. Some differences has been made: - "kprobe" word is removed from any APIs/structures. - BPF_ALLOW_ERROR_INJECTION() is renamed to ALLOW_ERROR_INJECTION() since it is not limited for BPF too. - CONFIG_FUNCTION_ERROR_INJECTION is the config item of this feature. It is automatically enabled if the arch supports error injection feature for kprobe or ftrace etc. Signed-off-by: Masami Hiramatsu--- Changes in v2: - Fix the override function name to override_function_with_return() - Show only function name in the list, user don't have to care about it's size, since function override only happens at the entry. --- arch/Kconfig |2 arch/x86/Kconfig |2 arch/x86/include/asm/error-injection.h | 12 ++ arch/x86/kernel/kprobes/ftrace.c | 14 -- arch/x86/lib/Makefile |2 arch/x86/lib/error-inject.c| 19 +++ fs/btrfs/disk-io.c |2 fs/btrfs/free-space-cache.c|2 include/asm-generic/error-injection.h | 20 +++ include/asm-generic/vmlinux.lds.h | 14 +- include/linux/bpf.h| 12 -- include/linux/error-injection.h| 21 +++ include/linux/kprobes.h|1 include/linux/module.h |6 - kernel/kprobes.c | 163 -- kernel/module.c|8 + kernel/trace/Kconfig |2 kernel/trace/bpf_trace.c |2 kernel/trace/trace_kprobe.c|3 lib/Kconfig.debug |4 + lib/Makefile |1 lib/error-inject.c | 198 22 files changed, 300 insertions(+), 210 deletions(-) create mode 100644 arch/x86/include/asm/error-injection.h create mode 100644 arch/x86/lib/error-inject.c create mode 100644 include/asm-generic/error-injection.h create mode 100644 include/linux/error-injection.h create mode 100644 lib/error-inject.c diff --git a/arch/Kconfig b/arch/Kconfig index d3f4aaf9cb7a..97376accfb14 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,7 +196,7 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool 
-config HAVE_KPROBE_OVERRIDE +config HAVE_FUNCTION_ERROR_INJECTION bool config HAVE_NMI diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 04d66e6fa447..fc519e3ae754 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -154,7 +154,7 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE - select HAVE_KPROBE_OVERRIDE + select HAVE_FUNCTION_ERROR_INJECTION select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/error-injection.h b/arch/x86/include/asm/error-injection.h new file mode 100644 index ..6c2a133622f4 --- /dev/null +++ b/arch/x86/include/asm/error-injection.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_ERROR_INJECTION_H +#define _ASM_ERROR_INJECTION_H + +#include +#include +#include + +asmlinkage void just_return_func(void); +void override_function_with_return(struct pt_regs *regs); + +#endif /* _ASM_ERROR_INJECTION_H */ diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 1ea748d682fd..8dc0161cec8f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -97,17 +97,3 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } - -asmlinkage void override_func(void); -asm( - ".type override_func, @function\n" - "override_func:\n" - " ret\n" - ".size override_func, .-override_func\n" -); - -void arch_ftrace_kprobe_override_function(struct pt_regs *regs) -{ - regs->ip = (unsigned long)_func; -} -NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 7b181b61170e..081f09435d28 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -26,6 +26,8 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + 
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c new file mode 100644 index ..7b881d03d0dd --- /dev/null +++ b/arch/x86/lib/error-inject.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier:
[RFC PATCH bpf-next v2 2/4] tracing/kprobe: bpf: Compare instruction pointer with original one
Compare instruction pointer with original one on the stack instead using per-cpu bpf_kprobe_override flag. This patch also consolidates reset_current_kprobe() and preempt_enable_no_resched() blocks. Those can be done in one place. Signed-off-by: Masami Hiramatsu--- kernel/trace/bpf_trace.c|1 - kernel/trace/trace_kprobe.c | 21 +++-- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d663660f8392..cefa9b0e396c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -83,7 +83,6 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { - __this_cpu_write(bpf_kprobe_override, 1); regs_set_return_value(regs, rc); arch_ftrace_kprobe_override_function(regs); return 0; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 265e3e27e8dc..a7c7035963f2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,8 +42,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -DEFINE_PER_CPU(int, bpf_kprobe_override); - static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; @@ -1204,6 +1202,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int rctx; if (bpf_prog_array_valid(call)) { + unsigned long orig_ip = instruction_pointer(regs); int ret; ret = trace_call_bpf(call, regs); @@ -1211,12 +1210,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) /* * We need to check and see if we modified the pc of the * pt_regs, and if so clear the kprobe and return 1 so that we -* don't do the instruction skipping. Also reset our state so -* we are clean the next pass through. +* don't do the single stepping. +* The ftrace kprobe handler leaves it up to us to re-enable +* preemption here before returning if we've modified the ip. 
*/ - if (__this_cpu_read(bpf_kprobe_override)) { - __this_cpu_write(bpf_kprobe_override, 0); + if (orig_ip != instruction_pointer(regs)) { reset_current_kprobe(); + preempt_enable_no_resched(); return 1; } if (!ret) @@ -1324,15 +1324,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tk->tp.flags & TP_FLAG_PROFILE) { + if (tk->tp.flags & TP_FLAG_PROFILE) ret = kprobe_perf_func(tk, regs); - /* -* The ftrace kprobe handler leaves it up to us to re-enable -* preemption here before returning if we've modified the ip. -*/ - if (ret) - preempt_enable_no_resched(); - } #endif return ret; }
[RFC PATCH bpf-next v2 1/4] tracing/kprobe: bpf: Check error injectable event is on function entry
Check whether error injectable event is on function entry or not. Currently it checks the event is ftrace-based kprobes or not, but that is wrong. It should check if the event is on the entry of target function. Since error injection will override a function to just return with modified return value, that operation must be done before the target function starts making stackframe. As a side effect, bpf error injection is no need to depend on function-tracer. It can work with sw-breakpoint based kprobe events too. Signed-off-by: Masami Hiramatsu--- kernel/trace/Kconfig|2 -- kernel/trace/bpf_trace.c|6 +++--- kernel/trace/trace_kprobe.c |8 +--- kernel/trace/trace_probe.h | 12 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ae3a2d519e50..6400e1bf97c5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -533,9 +533,7 @@ config FUNCTION_PROFILER config BPF_KPROBE_OVERRIDE bool "Enable BPF programs to override a kprobed function" depends on BPF_EVENTS - depends on KPROBES_ON_FTRACE depends on HAVE_KPROBE_OVERRIDE - depends on DYNAMIC_FTRACE_WITH_REGS default n help Allows BPF to override the execution of a probed function and diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f6d2327ecb59..d663660f8392 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -800,11 +800,11 @@ int perf_event_attach_bpf_prog(struct perf_event *event, int ret = -EEXIST; /* -* Kprobe override only works for ftrace based kprobes, and only if they -* are on the opt-in list. +* Kprobe override only works if they are on the function entry, +* and only if they are on the opt-in list. 
*/ if (prog->kprobe_override && - (!trace_kprobe_ftrace(event->tp_event) || + (!trace_kprobe_on_func_entry(event->tp_event) || !trace_kprobe_error_injectable(event->tp_event))) return -EINVAL; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 91f4b57dab82..265e3e27e8dc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -88,13 +88,15 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } -int trace_kprobe_ftrace(struct trace_event_call *call) +bool trace_kprobe_on_func_entry(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - return kprobe_ftrace(>rp.kp); + + return kprobe_on_func_entry(tk->rp.kp.addr, tk->rp.kp.symbol_name, + tk->rp.kp.offset); } -int trace_kprobe_error_injectable(struct trace_event_call *call) +bool trace_kprobe_error_injectable(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; unsigned long addr; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5e54d748c84c..e101c5bb9eda 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -252,8 +252,8 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); -int trace_kprobe_ftrace(struct trace_event_call *call); -int trace_kprobe_error_injectable(struct trace_event_call *call); +bool trace_kprobe_on_func_entry(struct trace_event_call *call); +bool trace_kprobe_error_injectable(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8NULL @@ -280,14 +280,14 @@ alloc_symbol_cache(const char *sym, long offset) return NULL; } -static inline int trace_kprobe_ftrace(struct trace_event_call *call) +static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) { - return 0; + 
return false; } -static inline int trace_kprobe_error_injectable(struct trace_event_call *call) +static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) { - return 0; + return false; } #endif /* CONFIG_KPROBE_EVENTS */
[RFC PATCH bpf-next v2 0/4] Separate error injection table from kprobes
Hi Josef and Alexei, Here are the 2nd version of patches to moving error injection table from kprobes. In this series I did a small fixes and add function-based fault injection. Here is the previous version: https://lkml.org/lkml/2017/12/22/554 There are 2 main reasons why I separate it from kprobes. - kprobes users can modify execution path not only at error-injection whitelist functions but also other functions. I don't like to suggest user that such limitation is from kprobes itself. - This error injection information is also useful for ftrace (function-hook) and livepatch. It should not be limited by CONFIG_KPROBES. So I introduced CONFIG_FUNCTION_ERROR_INJECTION for this feature. Also CONFIG_FAIL_FUNCTION is added, which provides function-based error injection interface via debugfs following fault-injection framework. See [4/4]. Any thoughts? BTW, I think we should add an error-range description in ALLOW_ERROR_INJECTION() macro. If user sets a success return value and override it by mistake, caller must break data or cause kernel panic. 
Thank you, --- Masami Hiramatsu (4): tracing/kprobe: bpf: Check error injectable event is on function entry tracing/kprobe: bpf: Compare instruction pointer with original one error-injection: Separate error-injection from kprobe error-injection: Support fault injection framework Documentation/fault-injection/fault-injection.txt |5 + arch/Kconfig |2 arch/x86/Kconfig |2 arch/x86/include/asm/error-injection.h| 12 + arch/x86/kernel/kprobes/ftrace.c | 14 - arch/x86/lib/Makefile |2 arch/x86/lib/error-inject.c | 19 ++ fs/btrfs/disk-io.c|2 fs/btrfs/free-space-cache.c |2 include/asm-generic/error-injection.h | 20 ++ include/asm-generic/vmlinux.lds.h | 14 + include/linux/bpf.h | 12 - include/linux/error-injection.h | 21 ++ include/linux/kprobes.h |1 include/linux/module.h|6 - kernel/Makefile |1 kernel/fail_function.c| 169 ++ kernel/kprobes.c | 163 - kernel/module.c |8 - kernel/trace/Kconfig |4 kernel/trace/bpf_trace.c |9 - kernel/trace/trace_kprobe.c | 32 +-- kernel/trace/trace_probe.h| 12 + lib/Kconfig.debug | 14 + lib/Makefile |1 lib/error-inject.c| 198 + 26 files changed, 506 insertions(+), 239 deletions(-) create mode 100644 arch/x86/include/asm/error-injection.h create mode 100644 arch/x86/lib/error-inject.c create mode 100644 include/asm-generic/error-injection.h create mode 100644 include/linux/error-injection.h create mode 100644 kernel/fail_function.c create mode 100644 lib/error-inject.c -- Masami Hiramatsu (Linaro)
Re: [PATCH net-next] virtio_net: Add ethtool stats
On 2017/12/26 1:47, Florian Fainelli wrote: > On December 25, 2017 3:45:36 AM PST, Toshiaki Makita >wrote: >> On 2017/12/25 3:16, Stephen Hemminger wrote: >>> On Wed, 20 Dec 2017 13:40:37 +0900 >>> Toshiaki Makita wrote: >>> + +static const struct virtnet_gstats virtnet_gstrings_stats[] = { + { "rx_packets", VIRTNET_NETDEV_STAT(rx_packets) }, + { "tx_packets", VIRTNET_NETDEV_STAT(tx_packets) }, + { "rx_bytes", VIRTNET_NETDEV_STAT(rx_bytes) }, + { "tx_bytes", VIRTNET_NETDEV_STAT(tx_bytes) }, + { "rx_dropped", VIRTNET_NETDEV_STAT(rx_dropped) }, + { "rx_length_errors", VIRTNET_NETDEV_STAT(rx_length_errors) }, + { "rx_frame_errors",VIRTNET_NETDEV_STAT(rx_frame_errors) }, + { "tx_dropped", VIRTNET_NETDEV_STAT(tx_dropped) }, + { "tx_fifo_errors", VIRTNET_NETDEV_STAT(tx_fifo_errors) }, +}; + >>> >>> Please do not merge pre-existing global stats into ethtool. >>> It just duplicates existing functionality. >> >> Thanks for the feedback. >> I thought it's handy to be able to check stats like this in ethtool as >> well. Some drivers like ixgbe and mlx5 do similar thing. >> I can remove these duplicate stats (unless anyone has objection). > > For existing drivers, ethtool stats are kind of user ABI, so once these stats > get in, we should not consider removing them. For new drivers or existing > drivers without ethtool stats, we can do things a bit better as Stephen > suggested. OK. will remove them in v2. Thanks, Toshiaki Makita
xfrm: Forbid state updates from changing encap type
Currently we allow state updates to competely replace the contents of x->encap. This is bad because on the user side ESP only sets up header lengths depending on encap_type once when the state is first created. This could result in the header lengths getting out of sync with the actual state configuration. In practice key managers will never do a state update to change the encapsulation type. Only the port numbers need to be changed as the peer NAT entry is updated. Therefore this patch adds a check in xfrm_state_update to forbid any changes to the encap_type. Signed-off-by: Herbert Xudiff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 065d896..dc1cdde 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1533,8 +1533,12 @@ int xfrm_state_update(struct xfrm_state *x) err = -EINVAL; spin_lock_bh(>lock); if (likely(x1->km.state == XFRM_STATE_VALID)) { - if (x->encap && x1->encap) + if (x->encap && x1->encap && + x->encap->encap_type == x1->encap->encap_type) memcpy(x1->encap, x->encap, sizeof(*x1->encap)); + else if (x->encap || x1->encap) + goto fail; + if (x->coaddr && x1->coaddr) { memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr)); } @@ -1551,6 +1555,8 @@ int xfrm_state_update(struct xfrm_state *x) x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); } + +fail: spin_unlock_bh(>lock); xfrm_state_put(x1); -- Email: Herbert Xu Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
[PATCH net-next 1/1] forcedeth: optimize the rx with likely
In the rx fastpath, the function netdev_alloc_skb rarely fails. Therefore, a likely() optimization is added to this error check conditional. CC: Srinivas EedaCC: Joe Jin CC: Junxiao Bi Signed-off-by: Zhu Yanjun --- drivers/net/ethernet/nvidia/forcedeth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index 49d6d78..a79b9f8 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -1817,7 +1817,7 @@ static int nv_alloc_rx(struct net_device *dev) while (np->put_rx.orig != less_rx) { struct sk_buff *skb = netdev_alloc_skb(dev, np->rx_buf_sz + NV_RX_ALLOC_PAD); - if (skb) { + if (likely(skb)) { np->put_rx_ctx->skb = skb; np->put_rx_ctx->dma = dma_map_single(>pci_dev->dev, skb->data, @@ -1858,7 +1858,7 @@ static int nv_alloc_rx_optimized(struct net_device *dev) while (np->put_rx.ex != less_rx) { struct sk_buff *skb = netdev_alloc_skb(dev, np->rx_buf_sz + NV_RX_ALLOC_PAD); - if (skb) { + if (likely(skb)) { np->put_rx_ctx->skb = skb; np->put_rx_ctx->dma = dma_map_single(>pci_dev->dev, skb->data, -- 2.7.4
Re: iproute2 net-next
On Tue, 26 Dec 2017 06:47:43 +0200 Leon Romanovsky wrote: > On Mon, Dec 25, 2017 at 10:49:19AM -0800, Stephen Hemminger wrote: > > David Ahern has agreed to take over managing the net-next branch of > > iproute2. > > The new location is: > > https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/ > > > > In the past, I have accepted new features into iproute2 master branch, but > > am changing the policy so that outside of the merge window (up until -rc1) > > new features will get put into net-next to get some more review and testing > > time. This means that things like the proposed batch streaming mode will > > go through net-next. > > > > Hi Stephen, > > Did you consider to create one shared repo for the iproute2 to allow > multiple committers workflow? For now having separate trees is best; there is no need for multiple committers, the load is very light. > It will be much convenient for the users to have one place for > master/stable/net-next branches, instead of actually following two > different repositories. If you are doing network development, you already need to deal with multiple repos on the kernel side so there is no difference. > > Example, of such shared repo: > BPF: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/ > Bluetooth: > https://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git/ > RDMA: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/ Most of these are high volume or vendor silo'd which is not the case here. pgpNi4qEun94v.pgp Description: OpenPGP digital signature
[PATCH net] net/sched: Fix update of lastuse in act modules implementing stats_update
We need to update lastuse to to the most updated value between what is already set and the new value. If HW matching fails, i.e. because of an issue, the stats are not updated but it could be that software did match and updated lastuse. Fixes: 5712bf9c5c30 ("net/sched: act_mirred: Use passed lastuse argument") Fixes: 9fea47d93bcc ("net/sched: act_gact: Update statistics when offloaded to hardware") Signed-off-by: Roi DayanReviewed-by: Paul Blakey Acked-by: Jiri Pirko --- net/sched/act_gact.c | 2 +- net/sched/act_mirred.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e29a48e..a0ac42b 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -159,7 +159,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, if (action == TC_ACT_SHOT) this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; - tm->lastuse = lastuse; + tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 8b3e5938..08b6184 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -239,7 +239,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_t *tm = >tcf_tm; _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - tm->lastuse = lastuse; + tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, -- 2.7.5
Re: iproute2 net-next
On Mon, Dec 25, 2017 at 10:49:19AM -0800, Stephen Hemminger wrote: > David Ahern has agreed to take over managing the net-next branch of iproute2. > The new location is: > https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/ > > In the past, I have accepted new features into iproute2 master branch, but > am changing the policy so that outside of the merge window (up until -rc1) > new features will get put into net-next to get some more review and testing > time. This means that things like the proposed batch streaming mode will > go through net-next. > Hi Stephen, Did you consider creating one shared repo for iproute2 to allow a multiple-committers workflow? It will be much more convenient for the users to have one place for master/stable/net-next branches, instead of actually following two different repositories. Examples of such shared repos: BPF: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/ Bluetooth: https://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git/ RDMA: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/ Thanks > Thanks > Stephen Hemminger signature.asc Description: PGP signature
Re: [PATCH net-next v8 1/2] dt-bindings: net: add DT bindings for Socionext UniPhier AVE
On December 24, 2017 5:10:37 PM PST, Kunihiko Hayashi wrote: >DT bindings for the AVE ethernet controller found on Socionext's >UniPhier platforms. > >Signed-off-by: Kunihiko Hayashi >Signed-off-by: Jassi Brar >Acked-by: Rob Herring Reviewed-by: Florian Fainelli -- Florian
Re: [PATCH net] phylink: ensure AN is enabled
From: Russell King Date: Wed, 20 Dec 2017 23:21:34 + > Ensure that we mark AN as enabled at boot time, rather than leaving > it disabled. This is noticeable if your SFP module is fiber, and you > it supports faster speeds than 1G with 2.5G support in place. > > Signed-off-by: Russell King Also applied, and both of these phylink fixes are queued up for -stable as well.
Re: [PATCH net] phylink: ensure the PHY interface mode is appropriately set
From: Russell King Date: Wed, 20 Dec 2017 23:21:28 + > When setting the ethtool settings, ensure that the validated PHY > interface mode is propagated to the current link settings, so that > 2500BaseX can be selected. > > Signed-off-by: Russell King Applied.
[RFC PATCH v12 3/5] mwifiex: Disable wakeup irq handling for pcie
We are going to handle the wakeup irq in the pci core. Signed-off-by: Jeffy Chen--- Changes in v13: None Changes in v12: None Changes in v11: None Changes in v10: None Changes in v9: None Changes in v8: None Changes in v7: None Changes in v6: None Changes in v5: None Changes in v3: None Changes in v2: None drivers/net/wireless/marvell/mwifiex/main.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index a96bd7e653bf..3cc3403b977a 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -1567,6 +1567,10 @@ static void mwifiex_probe_of(struct mwifiex_adapter *adapter) goto err_exit; adapter->dt_node = dev->of_node; + + if (adapter->iface_type != MWIFIEX_PCIE) + goto err_exit; + adapter->irq_wakeup = irq_of_parse_and_map(adapter->dt_node, 0); if (!adapter->irq_wakeup) { dev_dbg(dev, "fail to parse irq_wakeup from device tree\n"); -- 2.11.0
[RFC PATCH v12 0/5] PCI: rockchip: Move PCIe WAKE# handling into pci core
Currently we are handling wake irq in mrvl wifi driver. Move it into pci core. Tested on my chromebook bob(with cros 4.4 kernel and mrvl wifi). Changes in v13: Fix compiler error reported by kbuild test robotChanges in v12: Only add irq definitions for PCI devices and rewrite the commit message. Enable the wake irq in noirq stage to avoid possible irq storm. Changes in v11: Address Brian's comments. Only support 1-per-device PCIe WAKE# pin as suggested. Move to pcie port as Brian suggested. Changes in v10: Use device_set_wakeup_capable() instead of device_set_wakeup_enable(), since dedicated wakeirq will be lost in device_set_wakeup_enable(false). Changes in v9: Add section for PCI devices and rewrite the commit message. Fix check error in .cleanup(). Move dedicated wakeirq setup to setup() callback and use device_set_wakeup_enable() to enable/disable. Rewrite the commit message. Changes in v8: Add optional "pci", and rewrite commit message. Add pci-of.c and use platform_pm_ops to handle the PCIe WAKE# signal. Rewrite the commit message. Changes in v7: Move PCIE_WAKE handling into pci core. Changes in v6: Fix device_init_wake error handling, and add some comments. Changes in v5: Move to pci.txt Rebase. Use "wakeup" instead of "wake" Changes in v3: Fix error handling. Changes in v2: Use dev_pm_set_dedicated_wake_irq. 
Jeffy Chen (5): dt-bindings: PCI: Add definition of PCIe WAKE# irq and PCI irq of/irq: Adjust of_pci_irq parsing for multiple interrupts mwifiex: Disable wakeup irq handling for pcie PCI / PM: Add support for the PCIe WAKE# signal for OF arm64: dts: rockchip: Move PCIe WAKE# irq to pcie port for Gru Documentation/devicetree/bindings/pci/pci.txt | 10 arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi | 11 ++-- drivers/net/wireless/marvell/mwifiex/main.c | 4 ++ drivers/of/of_pci_irq.c | 71 +++-- drivers/pci/Makefile | 1 + drivers/pci/pci-driver.c | 10 drivers/pci/pci-of.c | 75 +++ include/linux/of_pci.h| 9 8 files changed, 183 insertions(+), 8 deletions(-) create mode 100644 drivers/pci/pci-of.c -- 2.11.0
[RFC PATCH v12 3/5] mwifiex: Disable wakeup irq handling for pcie
We are going to handle the wakeup irq in the pci core. Signed-off-by: Jeffy Chen--- Changes in v12: None Changes in v11: None Changes in v10: None Changes in v9: None Changes in v8: None Changes in v7: None Changes in v6: None Changes in v5: None Changes in v3: None Changes in v2: None drivers/net/wireless/marvell/mwifiex/main.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index a96bd7e653bf..3cc3403b977a 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -1567,6 +1567,10 @@ static void mwifiex_probe_of(struct mwifiex_adapter *adapter) goto err_exit; adapter->dt_node = dev->of_node; + + if (adapter->iface_type != MWIFIEX_PCIE) + goto err_exit; + adapter->irq_wakeup = irq_of_parse_and_map(adapter->dt_node, 0); if (!adapter->irq_wakeup) { dev_dbg(dev, "fail to parse irq_wakeup from device tree\n"); -- 2.11.0
[RFC PATCH v12 0/5] PCI: rockchip: Move PCIe WAKE# handling into pci core
Currently we are handling wake irq in mrvl wifi driver. Move it into pci core. Tested on my chromebook bob(with cros 4.4 kernel and mrvl wifi). Changes in v12: Enable the wake irq in noirq stage to avoid possible irq storm. Changes in v11: Only add irq definitions for PCI devices and rewrite the commit message. Address Brian's comments. Only support 1-per-device PCIe WAKE# pin as suggested. Move to pcie port as Brian suggested. Changes in v10: Use device_set_wakeup_capable() instead of device_set_wakeup_enable(), since dedicated wakeirq will be lost in device_set_wakeup_enable(false). Changes in v9: Add section for PCI devices and rewrite the commit message. Fix check error in .cleanup(). Move dedicated wakeirq setup to setup() callback and use device_set_wakeup_enable() to enable/disable. Rewrite the commit message. Changes in v8: Add optional "pci", and rewrite commit message. Add pci-of.c and use platform_pm_ops to handle the PCIe WAKE# signal. Rewrite the commit message. Changes in v7: Move PCIE_WAKE handling into pci core. Changes in v6: Fix device_init_wake error handling, and add some comments. Changes in v5: Move to pci.txt Rebase. Use "wakeup" instead of "wake" Changes in v3: Fix error handling. Changes in v2: Use dev_pm_set_dedicated_wake_irq. Jeffy Chen (5): dt-bindings: PCI: Add definition of PCIe WAKE# irq and PCI irq of/irq: Adjust of_pci_irq parsing for multiple interrupts mwifiex: Disable wakeup irq handling for pcie PCI / PM: Add support for the PCIe WAKE# signal for OF arm64: dts: rockchip: Move PCIe WAKE# irq to pcie port for Gru Documentation/devicetree/bindings/pci/pci.txt | 10 arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi | 11 ++-- drivers/net/wireless/marvell/mwifiex/main.c | 4 ++ drivers/of/of_pci_irq.c | 74 -- drivers/pci/Makefile | 1 + drivers/pci/pci-driver.c | 10 drivers/pci/pci-of.c | 75 +++ include/linux/of_pci.h| 9 8 files changed, 186 insertions(+), 8 deletions(-) create mode 100644 drivers/pci/pci-of.c -- 2.11.0
[PATCH V2 net-next] selftests/net: fix bugs in address and port initialization
Address/port initialization should work correctly regardless of the order in which command line arguments are supplied, E.g, cfg_port should be used to connect to the remote host even if it is processed after -D, src/dst address initialization should not require that [-4|-6] be specified before the -S or -D args, receiver should be able to bind to *. Achieve this by making sure that the address/port structures are initialized after all command line options are parsed. Store cfg_port in host-byte order, and use htons() to set up the sin_port/sin6_port before bind/connect, so that the network system calls get the correct values in network-byte order. Signed-off-by: Sowmini Varadhan--- v2: fix fragility in address initialization as well. tools/testing/selftests/net/msg_zerocopy.c | 21 +++-- 1 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index 3ab6ec4..e11fe84 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -259,22 +259,28 @@ static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len) return sizeof(*ip6h); } -static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr) + +static void setup_sockaddr(int domain, const char *str_addr, + struct sockaddr_storage *sockaddr) { struct sockaddr_in6 *addr6 = (void *) sockaddr; struct sockaddr_in *addr4 = (void *) sockaddr; switch (domain) { case PF_INET: + memset(addr4, 0, sizeof(*addr4)); addr4->sin_family = AF_INET; addr4->sin_port = htons(cfg_port); - if (inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) + if (str_addr && + inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) error(1, 0, "ipv4 parse error: %s", str_addr); break; case PF_INET6: + memset(addr6, 0, sizeof(*addr6)); addr6->sin6_family = AF_INET6; addr6->sin6_port = htons(cfg_port); - if (inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) + if (str_addr && + 
inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) error(1, 0, "ipv6 parse error: %s", str_addr); break; default: @@ -603,6 +609,7 @@ static void parse_opts(int argc, char **argv) sizeof(struct tcphdr) - 40 /* max tcp options */; int c; + char *daddr = NULL, *saddr = NULL; cfg_payload_len = max_payload_len; @@ -627,7 +634,7 @@ static void parse_opts(int argc, char **argv) cfg_cpu = strtol(optarg, NULL, 0); break; case 'D': - setup_sockaddr(cfg_family, optarg, _dst_addr); + daddr = optarg; break; case 'i': cfg_ifindex = if_nametoindex(optarg); @@ -638,7 +645,7 @@ static void parse_opts(int argc, char **argv) cfg_cork_mixed = true; break; case 'p': - cfg_port = htons(strtoul(optarg, NULL, 0)); + cfg_port = strtoul(optarg, NULL, 0); break; case 'r': cfg_rx = true; @@ -647,7 +654,7 @@ static void parse_opts(int argc, char **argv) cfg_payload_len = strtoul(optarg, NULL, 0); break; case 'S': - setup_sockaddr(cfg_family, optarg, _src_addr); + saddr = optarg; break; case 't': cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000; @@ -660,6 +667,8 @@ static void parse_opts(int argc, char **argv) break; } } + setup_sockaddr(cfg_family, daddr, _dst_addr); + setup_sockaddr(cfg_family, saddr, _src_addr); if (cfg_payload_len > max_payload_len) error(1, 0, "-s: payload exceeds max (%d)", max_payload_len); -- 1.7.1
Re: [PATCH v5 iproute2 net-next] erspan: add erspan version II support
On 12/20/17 4:49 PM, William Tu wrote: > The patch adds support for configuring the erspan v2, for both > ipv4 and ipv6 erspan implementation. Three additional fields > are added: 'erspan_ver' for distinguishing v1 or v2, 'erspan_dir' > for specifying direction of the mirrored traffic, and 'erspan_hwid' > for users to set ERSPAN engine ID within a system. > > As for manpage, the ERSPAN descriptions used to be under GRE, IPIP, > SIT Type paragraph. Since IP6GRE/IP6GRETAP also supports ERSPAN, > the patch removes the old one, creates a separate ERSPAN paragrah, > and adds an example. > > Signed-off-by: William Tu> --- > change in v5: > - update the "Usage: " in c file, so > # ip link help erspan > shows the new options Since v4 was already applied to the new iproute2 net-next tree, please send an update against it. Thanks,
Re: [PATCH v4 iproute2 net-next] erspan: add erspan version II support
On 12/19/17 8:01 PM, William Tu wrote: > The patch adds support for configuring the erspan v2, for both > ipv4 and ipv6 erspan implementation. Three additional fields > are added: 'erspan_ver' for distinguishing v1 or v2, 'erspan_dir' > for specifying direction of the mirrored traffic, and 'erspan_hwid' > for users to set ERSPAN engine ID within a system. > > As for manpage, the ERSPAN descriptions used to be under GRE, IPIP, > SIT Type paragraph. Since IP6GRE/IP6GRETAP also supports ERSPAN, > the patch removes the old one, creates a separate ERSPAN paragrah, > and adds an example. > > Signed-off-by: William Tu> --- > change in v4: > - use matches instead of strcmp on ingress/egress > change in v3: > - change erspan_dir 0/1 to "in[gress]/e[gress]" > - update manpage > change in v2: > - fix typo ETH_P_ERSPAN2 > - fix space and indent > --- > include/uapi/linux/if_ether.h | 1 + > include/uapi/linux/if_tunnel.h | 3 ++ > ip/link_gre.c | 66 -- > ip/link_gre6.c | 67 -- > man/man8/ip-link.8.in | 92 > -- > 5 files changed, 210 insertions(+), 19 deletions(-) This was applied to iproute2 net-next: https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/
Re: [PATCH net-next] selftests/net: fix bugs in cfg_port initialization
On (12/25/17 15:49), Willem de Bruijn wrote: > > It would be good to also address the other instance of this at the > same time: protocol family (-4 or -6) has to be set before a call to > setup_sockaddr. Unlike this case, it hits an error() if called in the > "incorrect" order, but that is still a crutch. Yeah, I thought of that one too, but "usually" (at least that's what is instinctive to me) you bind to INADDR_ANY or ::, so this only becomes an issue on the client side. (And there, most people put the -4 or -6 before the address, but I agree it would be nice to fix this..) > But even better will be to save cfg_port, and src + dst addr optargs > as local variables, then call the init function only after parsing when > all state is available. Where this patch adds calls to > init_sockaddr_port, indeed. sure, I can put that out on V2 later this week.
[PATCH bpf-next 3/3] libbpf: break loop earlier
Get out of the loop when we have a match. Signed-off-by: Eric Leblond--- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 5fe8aaa2123e..d263748aa341 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -412,6 +412,7 @@ bpf_object__init_prog_names(struct bpf_object *obj) prog->section_name); return -LIBBPF_ERRNO__LIBELF; } + break; } if (!name) { -- 2.15.1
[PATCH bpf-next 0/3] add XDP loading support to libbpf
Hello, This patchset adds a function to load an XDP eBPF file in the libbpf library. It gets the netlink extended ack from the driver in case of failure and prints the error to stderr. Best regards, -- Eric Leblond
[PATCH bpf-next 2/3] libbpf: add error reporting in XDP
Parse netlink ext attribute to get the error message returned by the card. Signed-off-by: Eric Leblond--- tools/lib/bpf/Build| 2 +- tools/lib/bpf/bpf.c| 9 +++ tools/lib/bpf/nlattr.c | 188 + tools/lib/bpf/nlattr.h | 164 ++ 4 files changed, 362 insertions(+), 1 deletion(-) create mode 100644 tools/lib/bpf/nlattr.c create mode 100644 tools/lib/bpf/nlattr.h diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index d8749756352d..64c679d67109 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o bpf.o +libbpf-y := libbpf.o bpf.o nlattr.o diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 1e3cfe6b9fce..cfd30a0cbce4 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -26,6 +26,7 @@ #include #include "bpf.h" #include "libbpf.h" +#include "nlattr.h" #include #include #include @@ -436,6 +437,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) struct nlmsghdr *nh; struct nlmsgerr *err; socklen_t addrlen; + int one; memset(, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; @@ -445,6 +447,12 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) return -errno; } + if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, + , sizeof(one)) < 0) { + /* debug/verbose message that it is not supported */ + fprintf(stderr, "Netlink error reporting not supported\n"); + } + if (bind(sock, (struct sockaddr *), sizeof(sa)) < 0) { ret = -errno; goto cleanup; @@ -521,6 +529,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) if (!err->error) continue; ret = err->error; + nla_dump_errormsg(nh); goto cleanup; case NLMSG_DONE: break; diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c new file mode 100644 index ..962de14f74e3 --- /dev/null +++ b/tools/lib/bpf/nlattr.c @@ -0,0 +1,188 @@ + +/* + * NETLINK Netlink attributes + * + * Authors:Thomas Graf + * Alexey Kuznetsov + */ + +#include +#include "nlattr.h" +#include +#include +#include + +static const __u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { + [NLA_U8]= 
sizeof(__u8), + [NLA_U16] = sizeof(__u16), + [NLA_U32] = sizeof(__u32), + [NLA_U64] = sizeof(__u64), + [NLA_MSECS] = sizeof(__u64), + [NLA_NESTED]= NLA_HDRLEN, + [NLA_S8]= sizeof(__s8), + [NLA_S16] = sizeof(__s16), + [NLA_S32] = sizeof(__s32), + [NLA_S64] = sizeof(__s64), +}; + +static int validate_nla(const struct nlattr *nla, int maxtype, + const struct nla_policy *policy) +{ + const struct nla_policy *pt; + int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); + + if (type <= 0 || type > maxtype) + return 0; + + pt = [type]; + + if (pt->type > NLA_TYPE_MAX) + return -EINVAL; + + switch (pt->type) { + case NLA_FLAG: + if (attrlen > 0) + return -ERANGE; + break; + + case NLA_NUL_STRING: + if (pt->len) + minlen = min(attrlen, pt->len + 1); + else + minlen = attrlen; + + if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) + return -EINVAL; + /* fall through */ + + case NLA_STRING: + if (attrlen < 1) + return -ERANGE; + + if (pt->len) { + char *buf = nla_data(nla); + + if (buf[attrlen - 1] == '\0') + attrlen--; + + if (attrlen > pt->len) + return -ERANGE; + } + break; + + case NLA_BINARY: + if (pt->len && attrlen > pt->len) + return -ERANGE; + break; + + case NLA_NESTED_COMPAT: + if (attrlen < pt->len) + return -ERANGE; + if (attrlen < NLA_ALIGN(pt->len)) + break; + if (attrlen < NLA_ALIGN(pt->len) + NLA_HDRLEN) + return -ERANGE; + nla = nla_data(nla) + NLA_ALIGN(pt->len); + if (attrlen < NLA_ALIGN(pt->len) + NLA_HDRLEN + nla_len(nla)) + return -ERANGE; + break; + case NLA_NESTED: + /* a nested attributes is allowed to be
[PATCH bpf-next 1/3] libbpf: add function to setup XDP
Most of the code is taken from set_link_xdp_fd() in bpf_load.c and slightly modified to be library compliant. Signed-off-by: Eric Leblond--- tools/lib/bpf/bpf.c| 126 - tools/lib/bpf/libbpf.c | 2 + tools/lib/bpf/libbpf.h | 4 ++ 3 files changed, 130 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 5128677e4117..1e3cfe6b9fce 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -25,6 +25,16 @@ #include #include #include "bpf.h" +#include "libbpf.h" +#include +#include +#include + +#ifndef IFLA_XDP_MAX +#define IFLA_XDP 43 +#define IFLA_XDP_FD1 +#define IFLA_XDP_FLAGS 3 +#endif /* * When building perf, unistd.h is overridden. __NR_bpf is @@ -46,8 +56,6 @@ # endif #endif -#define min(x, y) ((x) < (y) ? (x) : (y)) - static inline __u64 ptr_to_u64(const void *ptr) { return (__u64) (unsigned long) ptr; @@ -413,3 +421,117 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len) return err; } + +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) +{ + struct sockaddr_nl sa; + int sock, seq = 0, len, ret = -1; + char buf[4096]; + struct nlattr *nla, *nla_xdp; + struct { + struct nlmsghdr nh; + struct ifinfomsg ifinfo; + char attrbuf[64]; + } req; + struct nlmsghdr *nh; + struct nlmsgerr *err; + socklen_t addrlen; + + memset(, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + return -errno; + } + + if (bind(sock, (struct sockaddr *), sizeof(sa)) < 0) { + ret = -errno; + goto cleanup; + } + + addrlen = sizeof(sa); + if (getsockname(sock, (struct sockaddr *), ) < 0) { + ret = errno; + goto cleanup; + } + + if (addrlen != sizeof(sa)) { + ret = errno; + goto cleanup; + } + + memset(, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.ifinfo.ifi_family = AF_UNSPEC; + 
req.ifinfo.ifi_index = ifindex; + + /* started nested attribute for XDP */ + nla = (struct nlattr *)(((char *)) + + NLMSG_ALIGN(req.nh.nlmsg_len)); + nla->nla_type = NLA_F_NESTED | IFLA_XDP; + nla->nla_len = NLA_HDRLEN; + + /* add XDP fd */ + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FD; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); + memcpy((char *)nla_xdp + NLA_HDRLEN, , sizeof(fd)); + nla->nla_len += nla_xdp->nla_len; + + /* if user passed in any flags, add those too */ + if (flags) { + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FLAGS; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); + memcpy((char *)nla_xdp + NLA_HDRLEN, , sizeof(flags)); + nla->nla_len += nla_xdp->nla_len; + } + + req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + + if (send(sock, , req.nh.nlmsg_len, 0) < 0) { + ret = -errno; + goto cleanup; + } + + len = recv(sock, buf, sizeof(buf), 0); + if (len < 0) { + ret = -errno; + goto cleanup; + } + + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); +nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != sa.nl_pid) { + ret = -LIBBPF_ERRNO__WRNGPID; + goto cleanup; + } + if (nh->nlmsg_seq != seq) { + ret = -LIBBPF_ERRNO__INVSEQ; + goto cleanup; + } + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + goto cleanup; + case NLMSG_DONE: + break; + default: + break; + } + } + + ret = 0; + +cleanup: + close(sock); + return ret; +} diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e9c4b7cabcf2..5fe8aaa2123e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -106,6 +106,8 @@ static const char *libbpf_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(PROG2BIG)] = "Program too big",
[PATCH v2 bpf-next 3/3] bpf: fix max call depth check
fix off by one error in max call depth check and add a test Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Alexei Starovoitov--- kernel/bpf/verifier.c | 4 ++-- tools/testing/selftests/bpf/test_verifier.c | 35 + 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 738e919efdf0..52ad60b3b8be 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2126,9 +2126,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_func_state *caller, *callee; int i, subprog, target_insn; - if (state->curframe >= MAX_CALL_FRAMES) { + if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", - state->curframe); + state->curframe + 2); return -E2BIG; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index b5a7a6c530dc..5d0a574ce270 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8885,6 +8885,41 @@ static struct bpf_test tests[] = { .errstr = "combined stack", }, { + "calls: stack depth check using three frames. 
test5", + .insns = { + /* main */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call A */ + BPF_EXIT_INSN(), + /* A */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call B */ + BPF_EXIT_INSN(), + /* B */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call C */ + BPF_EXIT_INSN(), + /* C */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call D */ + BPF_EXIT_INSN(), + /* D */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call E */ + BPF_EXIT_INSN(), + /* E */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call F */ + BPF_EXIT_INSN(), + /* F */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call G */ + BPF_EXIT_INSN(), + /* G */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */ + BPF_EXIT_INSN(), + /* H */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + .errstr = "call stack", + .result = REJECT, + }, + { "calls: spill into caller stack frame", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), -- 2.9.5
[PATCH v2 bpf-next 0/3] bpf: two stack check fixes
Jann reported an issue with stack depth tracking. Fix it and add tests. Also fix off-by-one error in MAX_CALL_FRAMES check. This set is on top of Jann's "selftest for late caller stack size increase" test. Alexei Starovoitov (3): bpf: fix maximum stack depth tracking logic selftests/bpf: additional stack depth tests bpf: fix max call depth check include/linux/bpf_verifier.h| 1 + kernel/bpf/verifier.c | 86 +++ tools/testing/selftests/bpf/test_verifier.c | 156 3 files changed, 225 insertions(+), 18 deletions(-) -- 2.9.5
[PATCH v2 bpf-next 1/3] bpf: fix maximum stack depth tracking logic
Instead of computing max stack depth for current call chain during the main verifier pass track stack depth of each function independently and after do_check() is done do another pass over all instructions analyzing depth of all possible call stacks. Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Reported-by: Jann HornSigned-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c| 82 +++- 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index aaac589e490c..94a02ceb1246 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -194,6 +194,7 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifer_log log; u32 subprog_starts[BPF_MAX_SUBPROGS]; + /* computes the stack depth of each bpf function */ u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82ae580440b8..738e919efdf0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1430,33 +1430,80 @@ static int update_stack_depth(struct bpf_verifier_env *env, const struct bpf_func_state *func, int off) { - u16 stack = env->subprog_stack_depth[func->subprogno], total = 0; - struct bpf_verifier_state *cur = env->cur_state; - int i; + u16 stack = env->subprog_stack_depth[func->subprogno]; if (stack >= -off) return 0; /* update known max for given subprogram */ env->subprog_stack_depth[func->subprogno] = -off; + return 0; +} - /* compute the total for current call chain */ - for (i = 0; i <= cur->curframe; i++) { - u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno]; - - /* round up to 32-bytes, since this is granularity -* of interpreter stack sizes -*/ - depth = round_up(depth, 32); - total += depth; - } +/* starting from main bpf function walk all instructions of the function + * and recursively walk all 
callees that given function can call. + * Ignore jump and exit insns. + * Since recursion is prevented by check_cfg() this algorithm + * only needs a local stack of MAX_CALL_FRAMES to remember callsites + */ +static int check_max_stack_depth(struct bpf_verifier_env *env) +{ + int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int ret_insn[MAX_CALL_FRAMES]; + int ret_prog[MAX_CALL_FRAMES]; - if (total > MAX_BPF_STACK) { +process_func: + /* round up to 32-bytes, since this is granularity +* of interpreter stack size +*/ + depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + if (depth > MAX_BPF_STACK) { verbose(env, "combined stack size of %d calls is %d. Too large\n", - cur->curframe, total); + frame + 1, depth); return -EACCES; } - return 0; +continue_func: + if (env->subprog_cnt == subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[subprog]; + for (; i < subprog_end; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + /* remember insn and function to return to */ + ret_insn[frame] = i + 1; + ret_prog[frame] = subprog; + + /* find the callee */ + i = i + insn[i].imm + 1; + subprog = find_subprog(env, i); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i); + return -EFAULT; + } + subprog++; + frame++; + if (frame >= MAX_CALL_FRAMES) { + WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); + return -EFAULT; + } + goto process_func; + } + /* end of for() loop means the last insn of the 'subprog' +* was reached. Doesn't matter whether it was JA or EXIT +*/ + if (frame == 0) + return 0; + depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + frame--; + i = ret_insn[frame]; + subprog = ret_prog[frame]; + goto continue_func; } static int get_callee_stack_depth(struct
[PATCH v2 bpf-next 2/3] selftests/bpf: additional stack depth tests
to test inner logic of stack depth tracking Signed-off-by: Alexei Starovoitov--- tools/testing/selftests/bpf/test_verifier.c | 121 1 file changed, 121 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 41dcc7dbba42..b5a7a6c530dc 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8764,6 +8764,127 @@ static struct bpf_test tests[] = { .result = REJECT, }, { + "calls: stack depth check using three frames. test1", + .insns = { + /* main */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4), /* call A */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 5), /* call B */ + BPF_ST_MEM(BPF_B, BPF_REG_10, -32, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* A */ + BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0), + BPF_EXIT_INSN(), + /* B */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -3), /* call A */ + BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + /* stack_main=32, stack_A=256, stack_B=64 +* and max(main+A, main+A+B) < 512 +*/ + .result = ACCEPT, + }, + { + "calls: stack depth check using three frames. test2", + .insns = { + /* main */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4), /* call A */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 5), /* call B */ + BPF_ST_MEM(BPF_B, BPF_REG_10, -32, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* A */ + BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0), + BPF_EXIT_INSN(), + /* B */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -3), /* call A */ + BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + /* stack_main=32, stack_A=64, stack_B=256 +* and max(main+A, main+A+B) < 512 +*/ + .result = ACCEPT, + }, + { + "calls: stack depth check using three frames. 
test3", + .insns = { + /* main */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 6), /* call A */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 8), /* call B */ + BPF_JMP_IMM(BPF_JGE, BPF_REG_6, 0, 1), + BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* A */ + BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 10, 1), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_B, BPF_REG_10, -224, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, -3), + /* B */ + BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 2, 1), + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -6), /* call A */ + BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + /* stack_main=64, stack_A=224, stack_B=256 +* and max(main+A, main+A+B) > 512 +*/ + .errstr = "combined stack", + .result = REJECT, + }, + { + "calls: stack depth check using three frames. test4", + /* void main(void) { +* func1(0); +* func1(1); +* func2(1); +* } +* void func1(int alloc_or_recurse) { +* if (alloc_or_recurse) { +* frame_pointer[-300] = 1; +* } else { +* func2(alloc_or_recurse); +* } +* } +* void func2(int alloc_or_recurse) { +* if (alloc_or_recurse) { +* frame_pointer[-300] = 1; +* } +* } +*/ + .insns = { + /* main */ + BPF_MOV64_IMM(BPF_REG_1, 0), +
Re: [PATCH net-next] selftests/net: fix bugs in cfg_port initialization
On Sun, Dec 24, 2017 at 12:23 PM, Sowmini Varadhanwrote: > If -S is not used in the command line, we should > be binding to *.. Similarly, cfg_port should be > used to connect to the remote host even if it is processed > after -D. Thus we need to make sure that the cfg_port in > cfg_src_addr and cfg_dst_addr are always initialized > after all other command line options are parsed. Thanks, Sowmini. Yes, input parsing as is is dependent on order, which is surprising and fragile. It would be good to also address the other instance of this at the same time: protocol family (-4 or -6) has to be set before a call to setup_sockaddr. Unlike this case, it hits an error() if called in the "incorrect" order, but that is still a crutch. The new init_sockaddr_port duplicates most of setup_sockaddr. More concise is to add a branch to that to skip inet_pton if !str_addr. But even better will be to save cfg_port, and src + dst addr optargs as local variables, then call the init function only after parsing when all state is available. Where this patch adds calls to init_sockaddr_port, indeed.
iproute2 net-next
David Ahern has agreed to take over managing the net-next branch of iproute2. The new location is: https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/ In the past, I have accepted new features into iproute2 master branch, but am changing the policy so that outside of the merge window (up until -rc1) new features will get put into net-next to get some more review and testing time. This means that things like the proposed batch streaming mode will go through net-next. Thanks Stephen Hemminger pgp7OgxiQatY9.pgp Description: OpenPGP digital signature
Fw: [Bug 198253] New: stopping htb results in null pointer dereference
Begin forwarded message: Date: Mon, 25 Dec 2017 08:25:44 +0000 From: bugzilla-dae...@bugzilla.kernel.org To: step...@networkplumber.org Subject: [Bug 198253] New: stopping htb results in null pointer dereference https://bugzilla.kernel.org/show_bug.cgi?id=198253 Bug ID: 198253 Summary: stopping htb results in null pointer dereference Product: Networking Version: 2.5 Kernel Version: 4.15 Hardware: All OS: Linux Tree: Mainline Status: NEW Severity: low Priority: P1 Component: Other Assignee: step...@networkplumber.org Reporter: lorinc.czegl...@gmail.com Regression: No Created attachment 261323 --> https://bugzilla.kernel.org/attachment.cgi?id=261323&action=edit BUG: unable to handle kernel NULL pointer dereference at 0018 the latest kernel linux415-4.15.r17121 as built by the manjaro team, using fireqos traffic shaping frontend with attached config produces BUG: unable to handle kernel NULL pointer dereference at 0018 -- You are receiving this mail because: You are the assignee for the bug.
Re: [PATCH net-next] virtio_net: Add ethtool stats
On December 25, 2017 3:45:36 AM PST, Toshiaki Makitawrote: >On 2017/12/25 3:16, Stephen Hemminger wrote: >> On Wed, 20 Dec 2017 13:40:37 +0900 >> Toshiaki Makita wrote: >> >>> + >>> +static const struct virtnet_gstats virtnet_gstrings_stats[] = { >>> + { "rx_packets", VIRTNET_NETDEV_STAT(rx_packets) }, >>> + { "tx_packets", VIRTNET_NETDEV_STAT(tx_packets) }, >>> + { "rx_bytes", VIRTNET_NETDEV_STAT(rx_bytes) }, >>> + { "tx_bytes", VIRTNET_NETDEV_STAT(tx_bytes) }, >>> + { "rx_dropped", VIRTNET_NETDEV_STAT(rx_dropped) }, >>> + { "rx_length_errors", VIRTNET_NETDEV_STAT(rx_length_errors) }, >>> + { "rx_frame_errors",VIRTNET_NETDEV_STAT(rx_frame_errors) }, >>> + { "tx_dropped", VIRTNET_NETDEV_STAT(tx_dropped) }, >>> + { "tx_fifo_errors", VIRTNET_NETDEV_STAT(tx_fifo_errors) }, >>> +}; >>> + >> >> Please do not merge pre-existing global stats into ethtool. >> It just duplicates existing functionality. > >Thanks for the feedback. >I thought it's handy to be able to check stats like this in ethtool as >well. Some drivers like ixgbe and mlx5 do similar thing. >I can remove these duplicate stats (unless anyone has objection). For existing drivers, ethtool stats are kind of user ABI, so once these stats get in, we should not consider removing them. For new drivers or existing drivers without ethtool stats, we can do things a bit better as Stephen suggested. -- Florian
Re: [net 02/14] Revert "mlx5: move affinity hints assignments to generic code"
Before the offending commit, mlx5 core did the IRQ affinity itself, and it seems that the new generic code has some drawbacks and one of them is the lack of user ability to modify irq affinity after the initial affinity values got assigned. The issue is still being discussed and a solution in the new generic code is required, until then we need to revert this patch. This fixes the following issue: echo <affinity mask> > /proc/irq/<irq number>/smp_affinity fails with -EIO This reverts commit a435393acafbf0ecff4deb3e3cb554b34f0d0664. Note: kept mlx5_get_vector_affinity in include/linux/mlx5/driver.h since it is used in mlx5_ib driver. This won't work for sure because the msi_desc affinity cpumask won't ever be populated. You need to re-implement it in mlx5 if you don't want to break rdma ULPs that rely on it.
Re: [PATCH] flex_can: Fix checking can_dlc
Answering myself after reading my own comment once more: In fact the code fix seems to be correct but the commit comment was completely wrong which lead to my answer ... can_dlc can hold values from 0 .. 8. The first 4 bytes are placed in data[0..3]. When we have more(!) than 4 bytes in can_dlc, the bytes 5..8 are placed in data[4..7]. The good thing was, that the current check was more conservative than your suggestion so that we did not detect any data loss. Please send a V2 patch with corrected commit message. Thanks, Oliver ps. From: "phu.luuan"+ * Copyright 2017 NXP A one-liner contribution like this tiny fix usually does not lead to an attribution in the copyrights. Your contribution is already perfectly recorded by the Signed-off-by tag.
[RFC PATCH v11 3/5] mwifiex: Disable wakeup irq handling for pcie
We are going to handle the wakeup irq in the pci core. Signed-off-by: Jeffy Chen--- Changes in v11: None Changes in v10: None Changes in v9: None Changes in v8: None Changes in v7: None Changes in v6: None Changes in v5: None Changes in v3: None Changes in v2: None drivers/net/wireless/marvell/mwifiex/main.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index a96bd7e653bf..3cc3403b977a 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -1567,6 +1567,10 @@ static void mwifiex_probe_of(struct mwifiex_adapter *adapter) goto err_exit; adapter->dt_node = dev->of_node; + + if (adapter->iface_type != MWIFIEX_PCIE) + goto err_exit; + adapter->irq_wakeup = irq_of_parse_and_map(adapter->dt_node, 0); if (!adapter->irq_wakeup) { dev_dbg(dev, "fail to parse irq_wakeup from device tree\n"); -- 2.11.0
[RFC PATCH v11 0/5] PCI: rockchip: Move PCIe WAKE# handling into pci core
Currently we are handling wake irq in mrvl wifi driver. Move it into pci core. Tested on my chromebook bob(with cros 4.4 kernel and mrvl wifi). Changes in v11: Only add irq definitions for PCI devices and rewrite the commit message. Address Brian's comments. Only support 1-per-device PCIe WAKE# pin as suggested. Move to pcie port as Brian suggested. Changes in v10: Use device_set_wakeup_capable() instead of device_set_wakeup_enable(), since dedicated wakeirq will be lost in device_set_wakeup_enable(false). Changes in v9: Add section for PCI devices and rewrite the commit message. Fix check error in .cleanup(). Move dedicated wakeirq setup to setup() callback and use device_set_wakeup_enable() to enable/disable. Rewrite the commit message. Changes in v8: Add optional "pci", and rewrite commit message. Add pci-of.c and use platform_pm_ops to handle the PCIe WAKE# signal. Rewrite the commit message. Changes in v7: Move PCIE_WAKE handling into pci core. Changes in v6: Fix device_init_wake error handling, and add some comments. Changes in v5: Move to pci.txt Rebase. Use "wakeup" instead of "wake" Changes in v3: Fix error handling. Changes in v2: Use dev_pm_set_dedicated_wake_irq. Jeffy Chen (5): dt-bindings: PCI: Add definition of PCIe WAKE# irq and PCI irq of/irq: Adjust of_pci_irq parsing for multiple interrupts mwifiex: Disable wakeup irq handling for pcie PCI / PM: Add support for the PCIe WAKE# signal for OF arm64: dts: rockchip: Move PCIe WAKE# irq to pcie port for Gru Documentation/devicetree/bindings/pci/pci.txt | 10 arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi | 11 +++-- drivers/net/wireless/marvell/mwifiex/main.c | 4 ++ drivers/of/of_pci_irq.c | 71 +-- drivers/pci/pci-driver.c | 10 include/linux/of_pci.h| 9 6 files changed, 107 insertions(+), 8 deletions(-) -- 2.11.0
Re: [PATCH net-next] virtio_net: Add ethtool stats
On 2017/12/25 3:16, Stephen Hemminger wrote: > On Wed, 20 Dec 2017 13:40:37 +0900 > Toshiaki Makitawrote: > >> + >> +static const struct virtnet_gstats virtnet_gstrings_stats[] = { >> +{ "rx_packets", VIRTNET_NETDEV_STAT(rx_packets) }, >> +{ "tx_packets", VIRTNET_NETDEV_STAT(tx_packets) }, >> +{ "rx_bytes", VIRTNET_NETDEV_STAT(rx_bytes) }, >> +{ "tx_bytes", VIRTNET_NETDEV_STAT(tx_bytes) }, >> +{ "rx_dropped", VIRTNET_NETDEV_STAT(rx_dropped) }, >> +{ "rx_length_errors", VIRTNET_NETDEV_STAT(rx_length_errors) }, >> +{ "rx_frame_errors",VIRTNET_NETDEV_STAT(rx_frame_errors) }, >> +{ "tx_dropped", VIRTNET_NETDEV_STAT(tx_dropped) }, >> +{ "tx_fifo_errors", VIRTNET_NETDEV_STAT(tx_fifo_errors) }, >> +}; >> + > > Please do not merge pre-existing global stats into ethtool. > It just duplicates existing functionality. Thanks for the feedback. I thought it's handy to be able to check stats like this in ethtool as well. Some drivers like ixgbe and mlx5 do similar thing. I can remove these duplicate stats (unless anyone has objection). -- Toshiaki Makita
Re: [PATCH] flex_can: Fix checking can_dlc
This patch looks wrong to me. On 12/19/2017 09:40 AM, Luu An Phu wrote: From: "phu.luuan"flexcan_start_xmit should write data to register when can_dlc > 4. Because can_dlc is data length and it has value from 1 to 8. No. can_dlc can contain values from 0 to 8. Even 0 is a valid DLC. But CAN data index has value from 0 to 7. So in case we have data in cf->data[4], the can_dlc has value is more than 4. Signed-off-by: Luu An Phu --- drivers/net/can/flexcan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 0626dcf..0e3ff13 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -5,6 +5,7 @@ * Copyright (c) 2009 Sascha Hauer, Pengutronix * Copyright (c) 2010-2017 Pengutronix, Marc Kleine-Budde * Copyright (c) 2014 David Jander, Protonic Holland + * Copyright 2017 NXP * * Based on code originally by Andrey Volkov * @@ -526,7 +527,7 @@ static int flexcan_start_xmit(struct sk_buff *skb, struct net_device *dev) data = be32_to_cpup((__be32 *)>data[0]); flexcan_write(data, >tx_mb->data[0]); } - if (cf->can_dlc > 3) { This is correct as data[0 .. 3] are filled from the code above. And if can_dlc is greater than 3 (== 4 .. 8) the following 4 bytes are copied into the registers. So everything is correct with the current code. + if (cf->can_dlc > 4) { data = be32_to_cpup((__be32 *)>data[4]); flexcan_write(data, >tx_mb->data[1]); } Regards, Oliver
Re: [patch net-next 02/10] devlink: Add support for resource abstraction
On 12/20/2017 09:43 PM, David Miller wrote: > From: Jiri Pirko> Date: Wed, 20 Dec 2017 12:58:13 +0100 > >> From: Arkadi Sharshevsky >> >> Add support for hardware resource abstraction over devlink. Each resource >> is identified via id, furthermore it contains information regarding its >> size and its related sub resources. Each resource can also provide its >> current occupancy. >> >> In some cases the sizes of some resources can be changed, yet for those >> changes to take place a hot driver reload may be needed. The reload >> capability will be introduced in the next patch. >> >> Signed-off-by: Arkadi Sharshevsky >> Signed-off-by: Jiri Pirko > > In what units are these sizes? If it depends upon the resource, it would > be great to have a way to introspect the units given a resource. > This is problematic. Currently the units are actually double words (single entry is 64 bit) because this resource is a actually a memory. So my first thought was adding an enum in UAPI of resource_units enum resource_units { DEVLINK_RESOURCE_UNITS_WORD, DEVLINK_RESOURCE_UNITS_DOUBLE_WORD, DEVLINK_RESOURCE_UNITS_ITEM, /* this is in order to define some driver specific stuff*/ ... }; But the 'item' is too vague, because for example, we will have the RIF bank as resource. What unit will it have? rifs? items? Any inputs on this? >> +struct devlink_resource_ops *resource_ops; > > Const? > >> +static inline int >> +devlink_resource_register(struct devlink *devlink, >> + const char *resource_name, >> + bool top_hierarchy, >> + u64 resource_size, >> + u64 resource_id, >> + u64 parent_resource_id, >> + struct devlink_resource_ops *resource_ops) > > Const for resource_ops? > >> +int devlink_resource_register(struct devlink *devlink, >> + const char *resource_name, >> + bool top_hierarchy, >> + u64 resource_size, >> + u64 resource_id, >> + u64 parent_resource_id, >> + struct devlink_resource_ops *resource_ops) > > Likewise. >
Re: [patch net-next v4 00/10] net: sched: allow qdiscs to share filter block instances
Sun, Dec 24, 2017 at 05:25:41PM CET, dsah...@gmail.com wrote: >On 12/24/17 1:19 AM, Jiri Pirko wrote: >> Sun, Dec 24, 2017 at 02:54:47AM CET, dsah...@gmail.com wrote: >>> On 12/23/17 9:54 AM, Jiri Pirko wrote: So back to the example. First, we create 2 qdiscs. Both will share block number 22. "22" is just an identification. If we don't pass any block number, a new one will be generated by kernel: $ tc qdisc add dev ens7 ingress block 22 $ tc qdisc add dev ens8 ingress block 22 Now if we list the qdiscs, we will see the block index in the output: $ tc qdisc qdisc ingress : dev ens7 parent :fff1 block 22 qdisc ingress : dev ens8 parent :fff1 block 22 To make it more visual, the situation looks like this: ens7 ingress qdisc ens7 ingress qdisc | | | | +--> block 22 <--+ Unlimited number of qdiscs may share the same block. Now we can add filter to any of qdiscs sharing the same block: $ tc filter add dev ens7 ingress protocol ip pref 25 flower dst_ip 192.168.0.0/16 action drop >>> >>> >>> Allowing config of a shared block through any qdisc that references it >>> is akin to me allowing nexthop objects to be manipulated by any route >>> that references it -- sure, it could be done but causes a lot surprises >>> to the user. >>> >>> You are adding a new tc object -- a shared block. Why the resistance to >>> creating a proper API for managing it? >> >> Again, no resistance, I said many times it would be done as a follow-up. >> But as an api already exists, it has to continue to work. Or do you >> suggest it should stop working? That, I don't agree with. >> > >That is exactly what I am saying - principle of least surprise. The new >object brings its own API and can only be modified using the new API. >The scheme above can and will surprise users. You are thinking like a tc >developer, someone intimately familiar with the code, and not like an >ordinary user of this new feature. Breaking existing tools is never good.
Note that this is not only about the filter add/del iface but also about dump and notifications. I agree to extend the api for the "block handle", sure, but the existing api should continue to work.
Re: [QUESTION] Doubt about NAPI_GRO_CB(skb)->is_atomic in tcpv4 gro process
Hi, Alexander On 2017/12/23 0:32, Alexander Duyck wrote: > On Fri, Dec 22, 2017 at 12:49 AM, Yunsheng Linwrote: >> Hi, Alexander >> >> On 2017/12/22 0:29, Alexander Duyck wrote: >>> On Thu, Dec 21, 2017 at 1:16 AM, Yunsheng Lin >>> wrote: Hi, Alexander On 2017/12/21 0:24, Alexander Duyck wrote: > On Wed, Dec 20, 2017 at 1:09 AM, Yunsheng Lin > wrote: >> Hi, all >> I have some doubt about NAPI_GRO_CB(skb)->is_atomic when >> analyzing the tcpv4 gro process: >> >> Firstly we set NAPI_GRO_CB(skb)->is_atomic to 1 in dev_gro_receive: >> https://elixir.free-electrons.com/linux/v4.15-rc4/source/net/core/dev.c#L4838 >> >> And then in inet_gro_receive, we check the NAPI_GRO_CB(skb)->is_atomic >> before setting NAPI_GRO_CB(skb)->is_atomic according to IP_DF bit in the >> ip header: >> https://elixir.free-electrons.com/linux/v4.15-rc4/source/net/ipv4/af_inet.c#L1319 >> >> struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff >> *skb) >> { >> . >> for (p = *head; p; p = p->next) { >> >> >> /* If the previous IP ID value was based on an atomic >> * datagram we can overwrite the value and ignore it. >> */ >> if (NAPI_GRO_CB(skb)->is_atomic) >> //we check it here >> NAPI_GRO_CB(p)->flush_id = flush_id; >> else >> NAPI_GRO_CB(p)->flush_id |= flush_id; >> } >> >> NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); >> //we set it here >> NAPI_GRO_CB(skb)->flush |= flush; >> skb_set_network_header(skb, off); >> >> } >> >> My question is whether we should check the NAPI_GRO_CB(skb)->is_atomic >> or NAPI_GRO_CB(p)->is_atomic? >> If we should check NAPI_GRO_CB(skb)->is_atomic, then maybe it is >> unnecessary because it is alway true. >> If we should check NAPI_GRO_CB(p)->is_atomic, maybe there is a bug here. >> >> So what is the logic here? I am just start analyzing the gro, maybe I >> miss something obvious here. > > The logic there is to address the multiple IP header case where there > are 2 or more IP headers due to things like VXLAN or GRE tunnels. 
So > what will happen is that an outer IP header will end up being sent > with DF not set and will clear the is_atomic value then we want to OR > in the next header that is applied. It defaults to assignment on > is_atomic because the first IP header will encounter flush_id with no > previous configuration occupying it. I see your point now. But for the same flow of tunnels packet, the outer and inner ip header must have the same fixed id or increment id? For example, if we have a flow of tunnels packet which has fixed id in outer header and increment id in inner header(the inner header does have DF flag set): >> >> Sorry, a typo error here. I meant the inner header does *not* have DF flag >> set here. >> 1. For the first packet, NAPI_GRO_CB(skb)->is_atomic will be set to zero when inet_gro_receive is processing the inner ip header. 2. For the second packet, when inet_gro_receive is processing the outer ip header which has a fixed id, NAPI_GRO_CB(p)->is_atomic is zero according to [1], so NAPI_GRO_CB(p)->flush_id will be set to 0x, then the second packet will not be merged to first packet in tcp_gro_receive. >>> >>> I'm not sure how valid your case here is. The is_atomic is only really >>> meant to apply to the inner-most header. >> >> For the new skb, NAPI_GRO_CB(skb)->is_atomic is indeed applied to the >> inner-most header. >> >> What about the NAPI_GRO_CB(p)->is_atomic, p is the same skb flow already >> merged by gro. >> >> Let me try if I understand it correctly: >> when there is only one skb merged in p, then NAPI_GRO_CB(p)->is_atomic >> is set according to the first skb' inner-most ip DF flags. >> >> When the second skb comes, and inet_gro_receive is processing the >> outer-most ip, for the below code, NAPI_GRO_CB(p)->is_atomic >> is for first skb's inner-most ip DF flags, and "iph->frag_off & htons(IP_DF)" >> is for second skb' outer-most ip DF flags. >> Why don't we use the first skb's outer-most ip DF flags here? 
I think it is >> more logical to use first skb's outer-most ip DF flags here. But the first >> skb's outer-most ip DF flags is lost when we get here, right? >> >> if (!NAPI_GRO_CB(p)->is_atomic || >> !(iph->frag_off & htons(IP_DF))) { >> flush_id ^=
[patch net-next 0/2] net: sched: Fix RED qdisc offload flag
Replacing the RED offload flag (TC_RED_OFFLOADED) with the generic one (TCQ_F_OFFLOADED) deleted some of the logic behind it. This patchset fixes this problem. Nogah Frankel (2): net_sch: red: Fix the new offload indication net: sched: Move offload check till after dump call net/sched/sch_api.c | 5 ++--- net/sched/sch_red.c | 26 ++ 2 files changed, 16 insertions(+), 15 deletions(-) -- 2.4.3
[patch net-next 2/2] net: sched: Move offload check till after dump call
Move the check of the offload state to after the qdisc dump action was called, so the qdisc could update it if it was changed. Fixes: 7a4fa29106d9 ("net: sched: Add TCA_HW_OFFLOAD") Signed-off-by: Nogah FrankelReviewed-by: Yuval Mintz --- net/sched/sch_api.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 3a3a1da..758f132 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -807,11 +807,10 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, tcm->tcm_info = refcount_read(>refcnt); if (nla_put_string(skb, TCA_KIND, q->ops->id)) goto nla_put_failure; - if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) - goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; - + if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) + goto nla_put_failure; qlen = qdisc_qlen_sum(q); stab = rtnl_dereference(q->stab); -- 2.4.3
[patch net-next 1/2] net_sch: red: Fix the new offload indication
Update the offload flag, TCQ_F_OFFLOADED, in each dump call (and ignore the offloading function return value in relation to this flag). This is done because a qdisc is being initialized, and therefore offloaded before being grafted. Since the ability of the driver to offload the qdisc depends on its location, a qdisc can be offloaded and un-offloaded by graft calls, that doesn't effect the qdisc itself. Fixes: 428a68af3a7c ("net: sched: Move to new offload indication in RED" Signed-off-by: Nogah FrankelReviewed-by: Yuval Mintz --- net/sched/sch_red.c | 26 ++ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index ec0bd36..a392eaa 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -157,7 +157,6 @@ static int red_offload(struct Qdisc *sch, bool enable) .handle = sch->handle, .parent = sch->parent, }; - int err; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; @@ -172,14 +171,7 @@ static int red_offload(struct Qdisc *sch, bool enable) opt.command = TC_RED_DESTROY; } - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, ); - - if (!err && enable) - sch->flags |= TCQ_F_OFFLOADED; - else - sch->flags &= ~TCQ_F_OFFLOADED; - - return err; + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, ); } static void red_destroy(struct Qdisc *sch) @@ -297,12 +289,22 @@ static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt) .stats.qstats = >qstats, }, }; + int err; + + sch->flags &= ~TCQ_F_OFFLOADED; - if (!(sch->flags & TCQ_F_OFFLOADED)) + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return 0; + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, + _stats); + if (err == -EOPNOTSUPP) return 0; - return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, -_stats); + if (!err) + sch->flags |= TCQ_F_OFFLOADED; + + return err; } static int red_dump(struct Qdisc *sch, struct sk_buff *skb) -- 2.4.3
[patch iproute2 v3 1/4] lib/libnetlink: Add a function rtnl_talk_msg
rtnl_talk can only send a single message to kernel. Add a new function rtnl_talk_msg that can send multiple messages to kernel. Signed-off-by: Chris Mi--- include/libnetlink.h | 3 ++ lib/libnetlink.c | 92 2 files changed, 74 insertions(+), 21 deletions(-) diff --git a/include/libnetlink.h b/include/libnetlink.h index a4d83b9e..e95fad75 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -96,6 +96,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth, int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, struct nlmsghdr **answer) __attribute__((warn_unused_result)); +int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m, + struct nlmsghdr **answer) + __attribute__((warn_unused_result)); int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n, struct nlmsghdr **answer, nl_ext_ack_fn_t errfn) __attribute__((warn_unused_result)); diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 00e6ce0c..f5f675cf 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -581,36 +581,21 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct nlmsgerr *err, strerror(-err->error)); } -static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, - struct nlmsghdr **answer, - bool show_rtnl_err, nl_ext_ack_fn_t errfn) +static int __rtnl_check_ack(struct rtnl_handle *rtnl, struct nlmsghdr **answer, + bool show_rtnl_err, nl_ext_ack_fn_t errfn, + unsigned int seq) { int status; - unsigned int seq; - struct nlmsghdr *h; + char *buf; struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; - struct iovec iov = { - .iov_base = n, - .iov_len = n->nlmsg_len - }; + struct nlmsghdr *h; + struct iovec iov; struct msghdr msg = { .msg_name = , .msg_namelen = sizeof(nladdr), .msg_iov = , .msg_iovlen = 1, }; - char *buf; - - n->nlmsg_seq = seq = ++rtnl->seq; - - if (answer == NULL) - n->nlmsg_flags |= NLM_F_ACK; - - status = sendmsg(rtnl->fd, , 0); - if (status < 0) { - perror("Cannot talk to rtnetlink"); - return -1; - } while (1) { status = 
rtnl_recvmsg(rtnl->fd, , ); @@ -698,12 +683,77 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, } } +static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, + struct nlmsghdr **answer, + bool show_rtnl_err, nl_ext_ack_fn_t errfn) +{ + unsigned int seq; + int status; + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; + struct iovec iov = { + .iov_base = n, + .iov_len = n->nlmsg_len + }; + struct msghdr msg = { + .msg_name = , + .msg_namelen = sizeof(nladdr), + .msg_iov = , + .msg_iovlen = 1, + }; + + n->nlmsg_seq = seq = ++rtnl->seq; + + if (answer == NULL) + n->nlmsg_flags |= NLM_F_ACK; + + status = sendmsg(rtnl->fd, , 0); + if (status < 0) { + perror("Cannot talk to rtnetlink"); + return -1; + } + + return __rtnl_check_ack(rtnl, answer, show_rtnl_err, errfn, seq); +} + +static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m, + struct nlmsghdr **answer, + bool show_rtnl_err, nl_ext_ack_fn_t errfn) +{ + int iovlen = m->msg_iovlen; + unsigned int seq = 0; + struct nlmsghdr *n; + struct iovec *v; + int i, status; + + for (i = 0; i < iovlen; i++) { + v = >msg_iov[i]; + n = v->iov_base; + n->nlmsg_seq = seq = ++rtnl->seq; + if (answer == NULL) + n->nlmsg_flags |= NLM_F_ACK; + } + + status = sendmsg(rtnl->fd, m, 0); + if (status < 0) { + perror("Cannot talk to rtnetlink"); + return -1; + } + + return __rtnl_check_ack(rtnl, answer, show_rtnl_err, errfn, seq); +} + int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, struct nlmsghdr **answer) { return __rtnl_talk(rtnl, n, answer, true, NULL); } +int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m, + struct nlmsghdr **answer) +{ + return __rtnl_talk_msg(rtnl, m, answer, true, NULL); +} + int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n, struct nlmsghdr **answer, nl_ext_ack_fn_t errfn) -- 2.14.3
[patch iproute2 v3 3/4] tc: Add -bs option to batch mode
Signed-off-by: Chris Mi--- tc/m_action.c | 91 +-- tc/tc.c| 47 ++ tc/tc_common.h | 8 +++- tc/tc_filter.c | 121 + 4 files changed, 204 insertions(+), 63 deletions(-) diff --git a/tc/m_action.c b/tc/m_action.c index fc422364..c4c3b862 100644 --- a/tc/m_action.c +++ b/tc/m_action.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "utils.h" #include "tc_common.h" @@ -546,40 +547,88 @@ bad_val: return ret; } +typedef struct { + struct nlmsghdr n; + struct tcamsg t; + charbuf[MAX_MSG]; +} tc_action_req; + +static tc_action_req *action_reqs; +static struct iovec msg_iov[MSG_IOV_MAX]; + +void free_action_reqs(void) +{ + free(action_reqs); +} + +static tc_action_req *get_action_req(int batch_size, int index) +{ + tc_action_req *req; + + if (action_reqs == NULL) { + action_reqs = malloc(batch_size * sizeof (tc_action_req)); + if (action_reqs == NULL) + return NULL; + } + req = _reqs[index]; + memset(req, 0, sizeof (*req)); + + return req; +} + static int tc_action_modify(int cmd, unsigned int flags, - int *argc_p, char ***argv_p) + int *argc_p, char ***argv_p, + int batch_size, int index, bool send) { int argc = *argc_p; char **argv = *argv_p; int ret = 0; - struct { - struct nlmsghdr n; - struct tcamsg t; - charbuf[MAX_MSG]; - } req = { - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)), - .n.nlmsg_flags = NLM_F_REQUEST | flags, - .n.nlmsg_type = cmd, - .t.tca_family = AF_UNSPEC, + tc_action_req *req; + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; + struct iovec *iov = _iov[index]; + + req = get_action_req(batch_size, index); + if (req == NULL) { + fprintf(stderr, "get_action_req error: not enough buffer\n"); + return -ENOMEM; + } + + req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)); + req->n.nlmsg_flags = NLM_F_REQUEST | flags; + req->n.nlmsg_type = cmd; + req->t.tca_family = AF_UNSPEC; + struct rtattr *tail = NLMSG_TAIL(>n); + + struct msghdr msg = { + .msg_name = , + .msg_namelen = sizeof(nladdr), + .msg_iov = msg_iov, + .msg_iovlen = 
index + 1, }; - struct rtattr *tail = NLMSG_TAIL(); argc -= 1; argv += 1; - if (parse_action(, , TCA_ACT_TAB, )) { + if (parse_action(, , TCA_ACT_TAB, >n)) { fprintf(stderr, "Illegal \"action\"\n"); return -1; } - tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail; + tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail; + + *argc_p = argc; + *argv_p = argv; + + iov->iov_base = >n; + iov->iov_len = req->n.nlmsg_len; + + if (!send) + return 0; - if (rtnl_talk(, , NULL) < 0) { + ret = rtnl_talk_msg(, , NULL); + if (ret < 0) { fprintf(stderr, "We have an error talking to the kernel\n"); ret = -1; } - *argc_p = argc; - *argv_p = argv; - return ret; } @@ -679,7 +728,7 @@ bad_val: return ret; } -int do_action(int argc, char **argv) +int do_action(int argc, char **argv, int batch_size, int index, bool send) { int ret = 0; @@ -689,12 +738,14 @@ int do_action(int argc, char **argv) if (matches(*argv, "add") == 0) { ret = tc_action_modify(RTM_NEWACTION, NLM_F_EXCL | NLM_F_CREATE, - , ); + , , batch_size, + index, send); } else if (matches(*argv, "change") == 0 || matches(*argv, "replace") == 0) { ret = tc_action_modify(RTM_NEWACTION, NLM_F_CREATE | NLM_F_REPLACE, - , ); + , , batch_size, + index, send); } else if (matches(*argv, "delete") == 0) { argc -= 1; argv += 1; diff --git a/tc/tc.c b/tc/tc.c index ad9f07e9..7ea2fc89 100644 --- a/tc/tc.c +++ b/tc/tc.c @@ -189,20 +189,20 @@ static void usage(void) fprintf(stderr, "Usage: tc [ OPTIONS ] OBJECT {
[patch iproute2 v3 4/4] man: Add -bs option to tc manpage
Signed-off-by: Chris Mi--- man/man8/tc.8 | 5 + 1 file changed, 5 insertions(+) diff --git a/man/man8/tc.8 b/man/man8/tc.8 index ff071b33..de137e16 100644 --- a/man/man8/tc.8 +++ b/man/man8/tc.8 @@ -601,6 +601,11 @@ must exist already. read commands from provided file or standard input and invoke them. First failure will cause termination of tc. +.TP +.BR "\-bs", " \-bs size", " \-batchsize", " \-batchsize size" +How many commands are accumulated before sending to kernel. +By default, it is 1. It only takes effect in batch mode. + .TP .BR "\-force" don't terminate tc on errors in batch mode. -- 2.14.3
[patch iproute2 v3 2/4] utils: Add a function setcmdlinetotal
This function calculates how many commands a batch file has and set it to global variable cmdlinetotal. Signed-off-by: Chris Mi--- include/utils.h | 4 lib/utils.c | 20 2 files changed, 24 insertions(+) diff --git a/include/utils.h b/include/utils.h index d3895d56..113a8c31 100644 --- a/include/utils.h +++ b/include/utils.h @@ -235,6 +235,10 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr *n); extern int cmdlineno; ssize_t getcmdline(char **line, size_t *len, FILE *in); + +extern int cmdlinetotal; +void setcmdlinetotal(const char *name); + int makeargs(char *line, char *argv[], int maxargs); int inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6); diff --git a/lib/utils.c b/lib/utils.c index 7ced8c06..53ca389f 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -1202,6 +1202,26 @@ ssize_t getcmdline(char **linep, size_t *lenp, FILE *in) return cc; } +int cmdlinetotal; + +void setcmdlinetotal(const char *name) +{ + char *line = NULL; + size_t len = 0; + + if (name && strcmp(name, "-") != 0) { + if (freopen(name, "r", stdin) == NULL) { + fprintf(stderr, "Cannot open file \"%s\" for reading: %s\n", + name, strerror(errno)); + return; + } + } + + cmdlinetotal = 0; + while (getcmdline(, , stdin) != -1) + cmdlinetotal++; +} + /* split command line into argument vector */ int makeargs(char *line, char *argv[], int maxargs) { -- 2.14.3
[patch iproute2 v3 0/4] tc: Add -bs option to batch mode
Currently in tc batch mode, only one command is read from the batch file and sent to kernel to process. With this patchset, we can accumulate several commands before sending to kernel. The batch size is specified using option -bs or -batchsize. To accumulate the commands in tc, client should allocate an array of struct iovec. If batchsize is bigger than 1, only after the client has accumulated enough commands, can the client call rtnl_talk_msg to send the message that includes the iov array. One exception is that there is no more command in the batch file. But please note that kernel still processes the requests one by one. To process the requests in parallel in kernel is another effort. The time we're saving in this patchset is the user mode and kernel mode context switch. So this patchset works on top of the current kernel. Using the following script in kernel, we can generate 1,000,000 rules. tools/testing/selftests/tc-testing/tdc_batch.py Without this patchset, 'tc -b $file' execution time is: real0m15.125s user0m6.982s sys 0m8.080s With this patchset, 'tc -b $file -bs 10' execution time is: real0m12.772s user0m5.984s sys 0m6.723s The insertion rate is improved more than 10%. In this patchset, we still ack for every rule. If we don't ack at all, 'tc -b $file' execution time is: real0m14.484s user0m6.919s sys 0m7.498s 'tc -b $file -bs 10' execution time is: real0m11.664s user0m6.017s sys 0m5.578s We can see that the performance win is to send multiple messages instead of no acking. I think that's because in tc, we don't spend too much time processing the ack message. v3 == 1. Instead of hacking function rtnl_talk directly, add a new function rtnl_talk_msg. 2. remove most of global variables to use parameter passing 3. divide the previous patch into 4 patches.
Chris Mi (4): lib/libnetlink: Add a function rtnl_talk_msg utils: Add a function setcmdlinetotal tc: Add -bs option to batch mode man: Add -bs option to tc manpage include/libnetlink.h | 3 ++ include/utils.h | 4 ++ lib/libnetlink.c | 92 ++- lib/utils.c | 20 + man/man8/tc.8| 5 +++ tc/m_action.c| 91 +- tc/tc.c | 47 tc/tc_common.h | 8 +++- tc/tc_filter.c | 121 +-- 9 files changed, 307 insertions(+), 84 deletions(-) -- 2.14.3
[patch net] mlxsw: spectrum: Relax sanity checks during enslavement
From: Ido SchimmelSince commit 25cc72a33835 ("mlxsw: spectrum: Forbid linking to devices that have uppers") the driver forbids enslavement to netdevs that already have uppers of their own, as this can result in various ordering problems. This requirement proved to be too strict for some users who need to be able to enslave ports to a bridge that already has uppers. In this case, we can allow the enslavement if the bridge is already known to us, as any configuration performed on top of the bridge was already reflected to the device. Fixes: 25cc72a33835 ("mlxsw: spectrum: Forbid linking to devices that have uppers") Signed-off-by: Ido Schimmel Reported-by: Alexander Petrovskiy Tested-by: Alexander Petrovskiy Signed-off-by: Jiri Pirko --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 11 +-- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 2 ++ drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 6 ++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 9bd8d28..c3837ca 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4376,7 +4376,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, } if (!info->linking) break; - if (netdev_has_any_upper_dev(upper_dev)) { + if (netdev_has_any_upper_dev(upper_dev) && + (!netif_is_bridge_master(upper_dev) || +!mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp, + upper_dev))) { NL_SET_ERR_MSG(extack, "spectrum: Enslaving a port to a device that already has an upper device is not supported"); return -EINVAL; @@ -4504,6 +4507,7 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, u16 vid) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct netdev_notifier_changeupper_info *info = ptr; struct netlink_ext_ack *extack; struct net_device 
*upper_dev; @@ -4520,7 +4524,10 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, } if (!info->linking) break; - if (netdev_has_any_upper_dev(upper_dev)) { + if (netdev_has_any_upper_dev(upper_dev) && + (!netif_is_bridge_master(upper_dev) || +!mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp, + upper_dev))) { NL_SET_ERR_MSG(extack, "spectrum: Enslaving a port to a device that already has an upper device is not supported"); return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 432ab9b..05ce1be 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -365,6 +365,8 @@ int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port, void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port, struct net_device *brport_dev, struct net_device *br_dev); +bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp, +const struct net_device *br_dev); /* spectrum.c */ int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 7b8548e..593ad31 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -152,6 +152,12 @@ mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge, return NULL; } +bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp, +const struct net_device *br_dev) +{ + return !!mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev); +} + static struct mlxsw_sp_bridge_device * mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge, struct net_device *br_dev) -- 2.9.5
Re: [patch net] mlxsw: spectrum_router: Fix NULL pointer deref
Mon, Dec 25, 2017 at 08:57:35AM CET, j...@resnulli.us wrote: >From: Ido Schimmel> >When we remove the neighbour associated with a nexthop we should always >refuse to write the nexthop to the adjacency table. Regardless if it is >already present in the table or not. > >Otherwise, we risk dereferencing the NULL pointer that was set instead >of the neighbour. > >Fixes: a7ff87acd995 ("mlxsw: spectrum_router: Implement next-hop routing") >Signed-off-by: Ido Schimmel >Reported-by: Alexander Petrovskiy >Signed-off-by: Jiri Pirko Dave, could you please queue this up for 4.14.y together with "mlxsw: spectrum: Relax sanity checks during enslavement". Thanks!