[RFC PATCH bpf-next v2 4/4] error-injection: Support fault injection framework

2017-12-25 Thread Masami Hiramatsu
Support the in-kernel fault-injection framework via debugfs.
This allows you to inject a conditional error into a specified
function through debugfs interfaces.

Signed-off-by: Masami Hiramatsu 
---
 Documentation/fault-injection/fault-injection.txt |5 +
 kernel/Makefile   |1 
 kernel/fail_function.c|  169 +
 lib/Kconfig.debug |   10 +
 4 files changed, 185 insertions(+)
 create mode 100644 kernel/fail_function.c

diff --git a/Documentation/fault-injection/fault-injection.txt 
b/Documentation/fault-injection/fault-injection.txt
index 918972babcd8..6243a588dd71 100644
--- a/Documentation/fault-injection/fault-injection.txt
+++ b/Documentation/fault-injection/fault-injection.txt
@@ -30,6 +30,11 @@ o fail_mmc_request
   injects MMC data errors on devices permitted by setting
   debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request
 
+o fail_function
+
+  injects error return on specific functions by setting debugfs entries
+  under /sys/kernel/debug/fail_function. No boot option supported.
+
 Configure fault-injection capabilities behavior
 ---
 
diff --git a/kernel/Makefile b/kernel/Makefile
index 172d151d429c..f85ae5dfa474 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
new file mode 100644
index ..203d3582487a
--- /dev/null
+++ b/kernel/fail_function.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fail_function.c: Function-based error injection
+ */
+#include <linux/error-injection.h>
+#include <linux/debugfs.h>
+#include <linux/fault-inject.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
+
+static DEFINE_MUTEX(fei_lock);
+static struct {
+   struct kprobe kp;
+   unsigned long retval;
+   struct fault_attr attr;
+} fei_attr = {
+   .kp = { .pre_handler = fei_kprobe_handler, },
+   .retval = ~0UL, /* This indicates -1 in long/int return value */
+   .attr = FAULT_ATTR_INITIALIZER,
+};
+
+static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
+{
+   if (should_fail(&fei_attr.attr, 1)) {
+   regs_set_return_value(regs, fei_attr.retval);
+   override_function_with_return(regs);
+   /* Kprobe specific fixup */
+   reset_current_kprobe();
+   preempt_enable_no_resched();
+   return 1;
+   }
+
+   return 0;
+}
+NOKPROBE_SYMBOL(fei_kprobe_handler)
+
+static void *fei_seq_start(struct seq_file *m, loff_t *pos)
+{
+   mutex_lock(&fei_lock);
+   return *pos == 0 ? (void *)1 : NULL;
+}
+
+static void fei_seq_stop(struct seq_file *m, void *v)
+{
+   mutex_unlock(&fei_lock);
+}
+
+static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+   return NULL;
+}
+
+static int fei_seq_show(struct seq_file *m, void *v)
+{
+   if (fei_attr.kp.addr)
+   seq_printf(m, "%pf\n", fei_attr.kp.addr);
+   else
+   seq_puts(m, "# not specified\n");
+   return 0;
+}
+
+static const struct seq_operations fei_seq_ops = {
+   .start  = fei_seq_start,
+   .next   = fei_seq_next,
+   .stop   = fei_seq_stop,
+   .show   = fei_seq_show,
+};
+
+static int fei_open(struct inode *inode, struct file *file)
+{
+   return seq_open(file, &fei_seq_ops);
+}
+
+static ssize_t fei_write(struct file *file, const char __user *buffer,
+size_t count, loff_t *ppos)
+{
+   unsigned long addr;
+   char *buf, *sym;
+   int ret;
+
+   /* cut off if it is too long */
+   if (count > KSYM_NAME_LEN)
+   count = KSYM_NAME_LEN;
+   buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   if (copy_from_user(buf, buffer, count)) {
+   ret = -EFAULT;
+   goto out;
+   }
+   buf[count] = '\0';
+   sym = strstrip(buf);
+
+   if (strlen(sym) == 0 || sym[0] == '0') {
+   if (fei_attr.kp.addr) {
+   unregister_kprobe(&fei_attr.kp);
+   fei_attr.kp.addr = NULL;
+   }
+   ret = count;
+   goto out;
+   }
+
+   addr = kallsyms_lookup_name(sym);
+   if (!addr) {
+   ret = -EINVAL;
+   goto out;
+   }
+   if (!within_error_injection_list(addr)) {
+   ret = -ERANGE;
+   goto out;
+   }
+
+   if (fei_attr.kp.addr) {
+   

[RFC PATCH bpf-next v2 3/4] error-injection: Separate error-injection from kprobe

2017-12-25 Thread Masami Hiramatsu
The error-injection framework is not limited to kprobes or bpf;
other kernel subsystems, e.g. livepatch and ftrace, can use it
freely to check whether injecting an error is safe. So this patch
separates the error-injection framework from kprobes.

Some changes have been made:

- The word "kprobe" is removed from all APIs/structures.
- BPF_ALLOW_ERROR_INJECTION() is renamed to
  ALLOW_ERROR_INJECTION() since it is not limited to BPF
  (a sketch of a renamed call site follows this list).
- CONFIG_FUNCTION_ERROR_INJECTION is the config item for this
  feature. It is automatically enabled if the arch supports the
  error injection feature for kprobes, ftrace, etc.
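
(Illustration only, not part of the diff below: after this rename, a
call site such as the btrfs one in the diffstat would be tagged
roughly as

	ALLOW_ERROR_INJECTION(open_ctree);

replacing the old BPF_ALLOW_ERROR_INJECTION(open_ctree) marker; the
exact argument list used by this series may differ.)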

Signed-off-by: Masami Hiramatsu 
---
  Changes in v2:
   - Fix the override function name to override_function_with_return()
   - Show only the function name in the list; the user doesn't have to
 care about its size, since the function override only happens at the
 entry.
---
 arch/Kconfig   |2 
 arch/x86/Kconfig   |2 
 arch/x86/include/asm/error-injection.h |   12 ++
 arch/x86/kernel/kprobes/ftrace.c   |   14 --
 arch/x86/lib/Makefile  |2 
 arch/x86/lib/error-inject.c|   19 +++
 fs/btrfs/disk-io.c |2 
 fs/btrfs/free-space-cache.c|2 
 include/asm-generic/error-injection.h  |   20 +++
 include/asm-generic/vmlinux.lds.h  |   14 +-
 include/linux/bpf.h|   12 --
 include/linux/error-injection.h|   21 +++
 include/linux/kprobes.h|1 
 include/linux/module.h |6 -
 kernel/kprobes.c   |  163 --
 kernel/module.c|8 +
 kernel/trace/Kconfig   |2 
 kernel/trace/bpf_trace.c   |2 
 kernel/trace/trace_kprobe.c|3 
 lib/Kconfig.debug  |4 +
 lib/Makefile   |1 
 lib/error-inject.c |  198 
 22 files changed, 300 insertions(+), 210 deletions(-)
 create mode 100644 arch/x86/include/asm/error-injection.h
 create mode 100644 arch/x86/lib/error-inject.c
 create mode 100644 include/asm-generic/error-injection.h
 create mode 100644 include/linux/error-injection.h
 create mode 100644 lib/error-inject.c

diff --git a/arch/Kconfig b/arch/Kconfig
index d3f4aaf9cb7a..97376accfb14 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,7 +196,7 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
-config HAVE_KPROBE_OVERRIDE
+config HAVE_FUNCTION_ERROR_INJECTION
bool
 
 config HAVE_NMI
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 04d66e6fa447..fc519e3ae754 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -154,7 +154,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
-   select HAVE_KPROBE_OVERRIDE
+   select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/error-injection.h 
b/arch/x86/include/asm/error-injection.h
new file mode 100644
index ..6c2a133622f4
--- /dev/null
+++ b/arch/x86/include/asm/error-injection.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ERROR_INJECTION_H
+#define _ASM_ERROR_INJECTION_H
+
+#include 
+#include 
+#include 
+
+asmlinkage void just_return_func(void);
+void override_function_with_return(struct pt_regs *regs);
+
+#endif /* _ASM_ERROR_INJECTION_H */
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 1ea748d682fd..8dc0161cec8f 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,17 +97,3 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
-
-asmlinkage void override_func(void);
-asm(
-   ".type override_func, @function\n"
-   "override_func:\n"
-   "   ret\n"
-   ".size override_func, .-override_func\n"
-);
-
-void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
-{
-   regs->ip = (unsigned long)&override_func;
-}
-NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 7b181b61170e..081f09435d28 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -26,6 +26,8 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
 lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
+lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+
 
 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
new file mode 100644
index ..7b881d03d0dd
--- /dev/null
+++ b/arch/x86/lib/error-inject.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: 

[RFC PATCH bpf-next v2 2/4] tracing/kprobe: bpf: Compare instruction pointer with original one

2017-12-25 Thread Masami Hiramatsu
Compare the instruction pointer with the original one on the
stack instead of using the per-cpu bpf_kprobe_override flag.

This patch also consolidates reset_current_kprobe() and
preempt_enable_no_resched() blocks. Those can be done
in one place.

Signed-off-by: Masami Hiramatsu 
---
 kernel/trace/bpf_trace.c|1 -
 kernel/trace/trace_kprobe.c |   21 +++--
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d663660f8392..cefa9b0e396c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -83,7 +83,6 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
 BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
 {
-   __this_cpu_write(bpf_kprobe_override, 1);
regs_set_return_value(regs, rc);
arch_ftrace_kprobe_override_function(regs);
return 0;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 265e3e27e8dc..a7c7035963f2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -42,8 +42,6 @@ struct trace_kprobe {
(offsetof(struct trace_kprobe, tp.args) +   \
(sizeof(struct probe_arg) * (n)))
 
-DEFINE_PER_CPU(int, bpf_kprobe_override);
-
 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
 {
return tk->rp.handler != NULL;
@@ -1204,6 +1202,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs 
*regs)
int rctx;
 
if (bpf_prog_array_valid(call)) {
+   unsigned long orig_ip = instruction_pointer(regs);
int ret;
 
ret = trace_call_bpf(call, regs);
@@ -1211,12 +1210,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct 
pt_regs *regs)
/*
 * We need to check and see if we modified the pc of the
 * pt_regs, and if so clear the kprobe and return 1 so that we
-* don't do the instruction skipping.  Also reset our state so
-* we are clean the next pass through.
+* don't do the single stepping.
+* The ftrace kprobe handler leaves it up to us to re-enable
+* preemption here before returning if we've modified the ip.
 */
-   if (__this_cpu_read(bpf_kprobe_override)) {
-   __this_cpu_write(bpf_kprobe_override, 0);
+   if (orig_ip != instruction_pointer(regs)) {
reset_current_kprobe();
+   preempt_enable_no_resched();
return 1;
}
if (!ret)
@@ -1324,15 +1324,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct 
pt_regs *regs)
if (tk->tp.flags & TP_FLAG_TRACE)
kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
-   if (tk->tp.flags & TP_FLAG_PROFILE) {
+   if (tk->tp.flags & TP_FLAG_PROFILE)
ret = kprobe_perf_func(tk, regs);
-   /*
-* The ftrace kprobe handler leaves it up to us to re-enable
-* preemption here before returning if we've modified the ip.
-*/
-   if (ret)
-   preempt_enable_no_resched();
-   }
 #endif
return ret;
 }



[RFC PATCH bpf-next v2 1/4] tracing/kprobe: bpf: Check error injectable event is on function entry

2017-12-25 Thread Masami Hiramatsu
Check whether the error-injectable event is on function entry or not.
Currently it checks whether the event is an ftrace-based kprobe,
but that is wrong. It should check whether the event is on the entry
of the target function. Since error injection overrides a function
to just return with a modified return value, that operation must
be done before the target function starts building its stack frame.

As a side effect, bpf error injection no longer needs to depend on
the function tracer. It can work with software-breakpoint-based
kprobe events too.
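
To illustrate what "on function entry" means here (a sketch only, not
part of the patch; symbol and offset are just examples):

	#include <linux/kprobes.h>

	/* probes the first instruction of the target: override may be allowed */
	static struct kprobe kp_entry = { .symbol_name = "do_sys_open" };

	/* probes inside the function body: override must be rejected */
	static struct kprobe kp_body = { .symbol_name = "do_sys_open", .offset = 0x10 };

The new trace_kprobe_on_func_entry() check below distinguishes these
two cases instead of testing whether the kprobe is ftrace-based.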

Signed-off-by: Masami Hiramatsu 
---
 kernel/trace/Kconfig|2 --
 kernel/trace/bpf_trace.c|6 +++---
 kernel/trace/trace_kprobe.c |8 +---
 kernel/trace/trace_probe.h  |   12 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ae3a2d519e50..6400e1bf97c5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -533,9 +533,7 @@ config FUNCTION_PROFILER
 config BPF_KPROBE_OVERRIDE
bool "Enable BPF programs to override a kprobed function"
depends on BPF_EVENTS
-   depends on KPROBES_ON_FTRACE
depends on HAVE_KPROBE_OVERRIDE
-   depends on DYNAMIC_FTRACE_WITH_REGS
default n
help
 Allows BPF to override the execution of a probed function and
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f6d2327ecb59..d663660f8392 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -800,11 +800,11 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
int ret = -EEXIST;
 
/*
-* Kprobe override only works for ftrace based kprobes, and only if they
-* are on the opt-in list.
+* Kprobe override only works if they are on the function entry,
+* and only if they are on the opt-in list.
 */
if (prog->kprobe_override &&
-   (!trace_kprobe_ftrace(event->tp_event) ||
+   (!trace_kprobe_on_func_entry(event->tp_event) ||
 !trace_kprobe_error_injectable(event->tp_event)))
return -EINVAL;
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 91f4b57dab82..265e3e27e8dc 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -88,13 +88,15 @@ static nokprobe_inline unsigned long 
trace_kprobe_nhit(struct trace_kprobe *tk)
return nhit;
 }
 
-int trace_kprobe_ftrace(struct trace_event_call *call)
+bool trace_kprobe_on_func_entry(struct trace_event_call *call)
 {
struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
-   return kprobe_ftrace(&tk->rp.kp);
+
+   return kprobe_on_func_entry(tk->rp.kp.addr, tk->rp.kp.symbol_name,
+   tk->rp.kp.offset);
 }
 
-int trace_kprobe_error_injectable(struct trace_event_call *call)
+bool trace_kprobe_error_injectable(struct trace_event_call *call)
 {
struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
unsigned long addr;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5e54d748c84c..e101c5bb9eda 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -252,8 +252,8 @@ struct symbol_cache;
 unsigned long update_symbol_cache(struct symbol_cache *sc);
 void free_symbol_cache(struct symbol_cache *sc);
 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
-int trace_kprobe_ftrace(struct trace_event_call *call);
-int trace_kprobe_error_injectable(struct trace_event_call *call);
+bool trace_kprobe_on_func_entry(struct trace_event_call *call);
+bool trace_kprobe_error_injectable(struct trace_event_call *call);
 #else
 /* uprobes do not support symbol fetch methods */
 #define fetch_symbol_u8NULL
@@ -280,14 +280,14 @@ alloc_symbol_cache(const char *sym, long offset)
return NULL;
 }
 
-static inline int trace_kprobe_ftrace(struct trace_event_call *call)
+static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call)
 {
-   return 0;
+   return false;
 }
 
-static inline int trace_kprobe_error_injectable(struct trace_event_call *call)
+static inline bool trace_kprobe_error_injectable(struct trace_event_call *call)
 {
-   return 0;
+   return false;
 }
 #endif /* CONFIG_KPROBE_EVENTS */
 



[RFC PATCH bpf-next v2 0/4] Separate error injection table from kprobes

2017-12-25 Thread Masami Hiramatsu
Hi Josef and Alexei,

Here is the 2nd version of the patches moving the error-injection
table out of kprobes. In this series I made some small fixes and
added function-based fault injection.

Here is the previous version:

https://lkml.org/lkml/2017/12/22/554

There are 2 main reasons why I separate it from kprobes.

 - kprobes users can modify the execution path not only at
   error-injection whitelist functions but also at other
   functions. I don't want to suggest to users that such a
   limitation comes from kprobes itself.

 - This error-injection information is also useful for
   ftrace (function hooks) and livepatch. It should not
   be limited by CONFIG_KPROBES.

So I introduced CONFIG_FUNCTION_ERROR_INJECTION for this feature.
CONFIG_FAIL_FUNCTION is also added, which provides a function-based
error-injection interface via debugfs following the fault-injection
framework. See [4/4].

Any thoughts?

BTW, I think we should add an error-range description to the
ALLOW_ERROR_INJECTION() macro. If a user sets a success return
value and overrides the function with it by mistake, the caller
may corrupt data or cause a kernel panic.
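
A purely hypothetical sketch of what such an annotation could look
like (neither the macro name nor its arguments exist in this series):

	/* hypothetical: declare the range of return values an override may use */
	ALLOW_ERROR_INJECTION_RANGE(open_ctree, -MAX_ERRNO, -1);

so that the core could reject an override whose value falls outside
the declared range.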

Thank you,

---

Masami Hiramatsu (4):
  tracing/kprobe: bpf: Check error injectable event is on function entry
  tracing/kprobe: bpf: Compare instruction pointer with original one
  error-injection: Separate error-injection from kprobe
  error-injection: Support fault injection framework


 Documentation/fault-injection/fault-injection.txt |5 +
 arch/Kconfig  |2 
 arch/x86/Kconfig  |2 
 arch/x86/include/asm/error-injection.h|   12 +
 arch/x86/kernel/kprobes/ftrace.c  |   14 -
 arch/x86/lib/Makefile |2 
 arch/x86/lib/error-inject.c   |   19 ++
 fs/btrfs/disk-io.c|2 
 fs/btrfs/free-space-cache.c   |2 
 include/asm-generic/error-injection.h |   20 ++
 include/asm-generic/vmlinux.lds.h |   14 +
 include/linux/bpf.h   |   12 -
 include/linux/error-injection.h   |   21 ++
 include/linux/kprobes.h   |1 
 include/linux/module.h|6 -
 kernel/Makefile   |1 
 kernel/fail_function.c|  169 ++
 kernel/kprobes.c  |  163 -
 kernel/module.c   |8 -
 kernel/trace/Kconfig  |4 
 kernel/trace/bpf_trace.c  |9 -
 kernel/trace/trace_kprobe.c   |   32 +--
 kernel/trace/trace_probe.h|   12 +
 lib/Kconfig.debug |   14 +
 lib/Makefile  |1 
 lib/error-inject.c|  198 +
 26 files changed, 506 insertions(+), 239 deletions(-)
 create mode 100644 arch/x86/include/asm/error-injection.h
 create mode 100644 arch/x86/lib/error-inject.c
 create mode 100644 include/asm-generic/error-injection.h
 create mode 100644 include/linux/error-injection.h
 create mode 100644 kernel/fail_function.c
 create mode 100644 lib/error-inject.c

--
Masami Hiramatsu (Linaro)


Re: [PATCH net-next] virtio_net: Add ethtool stats

2017-12-25 Thread Toshiaki Makita
On 2017/12/26 1:47, Florian Fainelli wrote:
> On December 25, 2017 3:45:36 AM PST, Toshiaki Makita 
>  wrote:
>> On 2017/12/25 3:16, Stephen Hemminger wrote:
>>> On Wed, 20 Dec 2017 13:40:37 +0900
>>> Toshiaki Makita  wrote:
>>>
 +
 +static const struct virtnet_gstats virtnet_gstrings_stats[] = {
 +  { "rx_packets", VIRTNET_NETDEV_STAT(rx_packets) },
 +  { "tx_packets", VIRTNET_NETDEV_STAT(tx_packets) },
 +  { "rx_bytes",   VIRTNET_NETDEV_STAT(rx_bytes) },
 +  { "tx_bytes",   VIRTNET_NETDEV_STAT(tx_bytes) },
 +  { "rx_dropped", VIRTNET_NETDEV_STAT(rx_dropped) },
 +  { "rx_length_errors",   VIRTNET_NETDEV_STAT(rx_length_errors) },
 +  { "rx_frame_errors",VIRTNET_NETDEV_STAT(rx_frame_errors) },
 +  { "tx_dropped", VIRTNET_NETDEV_STAT(tx_dropped) },
 +  { "tx_fifo_errors", VIRTNET_NETDEV_STAT(tx_fifo_errors) },
 +};
 +
>>>
>>> Please do not merge pre-existing global stats into ethtool.
>>> It just duplicates existing functionality.
>>
>> Thanks for the feedback.
>> I thought it's handy to be able to check stats like this in ethtool as
>> well. Some drivers like ixgbe and mlx5 do a similar thing.
>> I can remove these duplicate stats (unless anyone has an objection).
> 
> For existing drivers, ethtool stats are kind of user ABI, so once these stats 
> get in, we should not consider removing them. For new drivers or existing 
> drivers without ethtool stats, we can do things a bit better as Stephen 
> suggested.

OK. will remove them in v2.

Thanks,
Toshiaki Makita



xfrm: Forbid state updates from changing encap type

2017-12-25 Thread Herbert Xu
Currently we allow state updates to completely replace the contents
of x->encap.  This is bad because on the user side ESP only sets up
header lengths depending on encap_type once when the state is first
created.  This could result in the header lengths getting out of
sync with the actual state configuration.

In practice key managers will never do a state update to change the
encapsulation type.  Only the port numbers need to be changed as the
peer NAT entry is updated.

Therefore this patch adds a check in xfrm_state_update to forbid
any changes to the encap_type.

Signed-off-by: Herbert Xu 

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 065d896..dc1cdde 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1533,8 +1533,12 @@ int xfrm_state_update(struct xfrm_state *x)
err = -EINVAL;
	spin_lock_bh(&x1->lock);
if (likely(x1->km.state == XFRM_STATE_VALID)) {
-   if (x->encap && x1->encap)
+   if (x->encap && x1->encap &&
+   x->encap->encap_type == x1->encap->encap_type)
memcpy(x1->encap, x->encap, sizeof(*x1->encap));
+   else if (x->encap || x1->encap)
+   goto fail;
+
if (x->coaddr && x1->coaddr) {
memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr));
}
@@ -1551,6 +1555,8 @@ int xfrm_state_update(struct xfrm_state *x)
x->km.state = XFRM_STATE_DEAD;
__xfrm_state_put(x);
}
+
+fail:
	spin_unlock_bh(&x1->lock);
 
xfrm_state_put(x1);
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH net-next 1/1] forcedeth: optimize the rx with likely

2017-12-25 Thread Zhu Yanjun
In the rx fastpath, the function netdev_alloc_skb rarely fails.
Therefore, a likely() optimization is added to this error check
conditional.

CC: Srinivas Eeda 
CC: Joe Jin 
CC: Junxiao Bi 
Signed-off-by: Zhu Yanjun 
---
 drivers/net/ethernet/nvidia/forcedeth.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c 
b/drivers/net/ethernet/nvidia/forcedeth.c
index 49d6d78..a79b9f8 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -1817,7 +1817,7 @@ static int nv_alloc_rx(struct net_device *dev)
 
while (np->put_rx.orig != less_rx) {
struct sk_buff *skb = netdev_alloc_skb(dev, np->rx_buf_sz + 
NV_RX_ALLOC_PAD);
-   if (skb) {
+   if (likely(skb)) {
np->put_rx_ctx->skb = skb;
np->put_rx_ctx->dma = dma_map_single(&np->pci_dev->dev,
 skb->data,
@@ -1858,7 +1858,7 @@ static int nv_alloc_rx_optimized(struct net_device *dev)
 
while (np->put_rx.ex != less_rx) {
struct sk_buff *skb = netdev_alloc_skb(dev, np->rx_buf_sz + 
NV_RX_ALLOC_PAD);
-   if (skb) {
+   if (likely(skb)) {
np->put_rx_ctx->skb = skb;
np->put_rx_ctx->dma = dma_map_single(&np->pci_dev->dev,
 skb->data,
-- 
2.7.4



Re: iproute2 net-next

2017-12-25 Thread Stephen Hemminger
On Tue, 26 Dec 2017 06:47:43 +0200
Leon Romanovsky  wrote:

> On Mon, Dec 25, 2017 at 10:49:19AM -0800, Stephen Hemminger wrote:
> > David Ahern has agreed to take over managing the net-next branch of 
> > iproute2.
> > The new location is:
> >  https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/
> >
> > In the past, I have accepted new features into iproute2 master branch, but
> > am changing the policy so that outside of the merge window (up until -rc1)
> > new features will get put into net-next to get some more review and testing
> > time. This means that things like the proposed batch streaming mode will
> > go through net-next.
> >  
> 
> Hi Stephen,
> 
> > Did you consider creating one shared repo for iproute2 to allow
> > a multiple-committers workflow?

For now having separate trees is best; there is no need for multiple
committers, the load is very light.


> It would be much more convenient for users to have one place for
> the master/stable/net-next branches, instead of having to follow
> two different repositories.

If you are doing network development, you already need to deal with
multiple repos on the kernel side, so there is no difference.

> 
> Example, of such shared repo:
> BPF: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
> Bluetooth: 
> https://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git/
> RDMA: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/

Most of these are high-volume or vendor-siloed, which is not the case here.





[PATCH net] net/sched: Fix update of lastuse in act modules implementing stats_update

2017-12-25 Thread Roi Dayan
We need to update lastuse to the most recent value between what
is already set and the new value.
If HW matching fails, e.g. because of an issue, the stats are not
updated, but it could be that software did match and updated lastuse.

Fixes: 5712bf9c5c30 ("net/sched: act_mirred: Use passed lastuse argument")
Fixes: 9fea47d93bcc ("net/sched: act_gact: Update statistics when offloaded to 
hardware")
Signed-off-by: Roi Dayan 
Reviewed-by: Paul Blakey 
Acked-by: Jiri Pirko 
---
 net/sched/act_gact.c   | 2 +-
 net/sched/act_mirred.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e29a48e..a0ac42b 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -159,7 +159,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 
bytes, u32 packets,
if (action == TC_ACT_SHOT)
this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
 
-   tm->lastuse = lastuse;
+   tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
 static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8b3e5938..08b6184 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -239,7 +239,7 @@ static void tcf_stats_update(struct tc_action *a, u64 
bytes, u32 packets,
struct tcf_t *tm = &m->tcf_tm;
 
_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
-   tm->lastuse = lastuse;
+   tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
 static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
-- 
2.7.5



Re: iproute2 net-next

2017-12-25 Thread Leon Romanovsky
On Mon, Dec 25, 2017 at 10:49:19AM -0800, Stephen Hemminger wrote:
> David Ahern has agreed to take over managing the net-next branch of iproute2.
> The new location is:
>  https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/
>
> In the past, I have accepted new features into iproute2 master branch, but
> am changing the policy so that outside of the merge window (up until -rc1)
> new features will get put into net-next to get some more review and testing
> time. This means that things like the proposed batch streaming mode will
> go through net-next.
>

Hi Stephen,

Did you consider creating one shared repo for iproute2 to allow
a multiple-committers workflow?

It would be much more convenient for users to have one place for
the master/stable/net-next branches, instead of having to follow
two different repositories.

Example, of such shared repo:
BPF: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
Bluetooth: 
https://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git/
RDMA: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/

Thanks


> Thanks
> Stephen Hemminger






Re: [PATCH net-next v8 1/2] dt-bindings: net: add DT bindings for Socionext UniPhier AVE

2017-12-25 Thread Florian Fainelli
On December 24, 2017 5:10:37 PM PST, Kunihiko Hayashi 
 wrote:
>DT bindings for the AVE ethernet controller found on Socionext's
>UniPhier platforms.
>
>Signed-off-by: Kunihiko Hayashi 
>Signed-off-by: Jassi Brar 
>Acked-by: Rob Herring 

Reviewed-by: Florian Fainelli 

-- 
Florian


Re: [PATCH net] phylink: ensure AN is enabled

2017-12-25 Thread David Miller
From: Russell King 
Date: Wed, 20 Dec 2017 23:21:34 +

> Ensure that we mark AN as enabled at boot time, rather than leaving
> it disabled.  This is noticeable if your SFP module is fiber, and
> it supports faster speeds than 1G with 2.5G support in place.
> 
> Signed-off-by: Russell King 

Also applied, and both of these phylink fixes are queued up for -stable
as well.


Re: [PATCH net] phylink: ensure the PHY interface mode is appropriately set

2017-12-25 Thread David Miller
From: Russell King 
Date: Wed, 20 Dec 2017 23:21:28 +

> When setting the ethtool settings, ensure that the validated PHY
> interface mode is propagated to the current link settings, so that
> 2500BaseX can be selected.
> 
> Signed-off-by: Russell King 

Applied.


[RFC PATCH v12 3/5] mwifiex: Disable wakeup irq handling for pcie

2017-12-25 Thread Jeffy Chen
We are going to handle the wakeup irq in the pci core.

Signed-off-by: Jeffy Chen 
---

Changes in v13: None
Changes in v12: None
Changes in v11: None
Changes in v10: None
Changes in v9: None
Changes in v8: None
Changes in v7: None
Changes in v6: None
Changes in v5: None
Changes in v3: None
Changes in v2: None

 drivers/net/wireless/marvell/mwifiex/main.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/marvell/mwifiex/main.c 
b/drivers/net/wireless/marvell/mwifiex/main.c
index a96bd7e653bf..3cc3403b977a 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -1567,6 +1567,10 @@ static void mwifiex_probe_of(struct mwifiex_adapter 
*adapter)
goto err_exit;
 
adapter->dt_node = dev->of_node;
+
+   if (adapter->iface_type != MWIFIEX_PCIE)
+   goto err_exit;
+
adapter->irq_wakeup = irq_of_parse_and_map(adapter->dt_node, 0);
if (!adapter->irq_wakeup) {
dev_dbg(dev, "fail to parse irq_wakeup from device tree\n");
-- 
2.11.0




[RFC PATCH v12 0/5] PCI: rockchip: Move PCIe WAKE# handling into pci core

2017-12-25 Thread Jeffy Chen

Currently we are handling the wake irq in the mrvl wifi driver. Move it
into the PCI core.

Tested on my chromebook bob (with cros 4.4 kernel and mrvl wifi).


Changes in v13:
Fix compiler error reported by kbuild test robot 

Changes in v12:
Only add irq definitions for PCI devices and rewrite the commit message.
Enable the wake irq in noirq stage to avoid possible irq storm.

Changes in v11:
Address Brian's comments.
Only support 1-per-device PCIe WAKE# pin as suggested.
Move to pcie port as Brian suggested.

Changes in v10:
Use device_set_wakeup_capable() instead of device_set_wakeup_enable(),
since dedicated wakeirq will be lost in device_set_wakeup_enable(false).

Changes in v9:
Add section for PCI devices and rewrite the commit message.
Fix check error in .cleanup().
Move dedicated wakeirq setup to setup() callback and use
device_set_wakeup_enable() to enable/disable.
Rewrite the commit message.

Changes in v8:
Add optional "pci", and rewrite commit message.
Add pci-of.c and use platform_pm_ops to handle the PCIe WAKE# signal.
Rewrite the commit message.

Changes in v7:
Move PCIE_WAKE handling into pci core.

Changes in v6:
Fix device_init_wake error handling, and add some comments.

Changes in v5:
Move to pci.txt
Rebase.
Use "wakeup" instead of "wake"

Changes in v3:
Fix error handling.

Changes in v2:
Use dev_pm_set_dedicated_wake_irq.

Jeffy Chen (5):
  dt-bindings: PCI: Add definition of PCIe WAKE# irq and PCI irq
  of/irq: Adjust of_pci_irq parsing for multiple interrupts
  mwifiex: Disable wakeup irq handling for pcie
  PCI / PM: Add support for the PCIe WAKE# signal for OF
  arm64: dts: rockchip: Move PCIe WAKE# irq to pcie port for Gru

 Documentation/devicetree/bindings/pci/pci.txt | 10 
 arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi  | 11 ++--
 drivers/net/wireless/marvell/mwifiex/main.c   |  4 ++
 drivers/of/of_pci_irq.c   | 71 +++--
 drivers/pci/Makefile  |  1 +
 drivers/pci/pci-driver.c  | 10 
 drivers/pci/pci-of.c  | 75 +++
 include/linux/of_pci.h|  9 
 8 files changed, 183 insertions(+), 8 deletions(-)
 create mode 100644 drivers/pci/pci-of.c

-- 
2.11.0




[RFC PATCH v12 3/5] mwifiex: Disable wakeup irq handling for pcie

2017-12-25 Thread Jeffy Chen
We are going to handle the wakeup irq in the pci core.

Signed-off-by: Jeffy Chen 
---

Changes in v12: None
Changes in v11: None
Changes in v10: None
Changes in v9: None
Changes in v8: None
Changes in v7: None
Changes in v6: None
Changes in v5: None
Changes in v3: None
Changes in v2: None

 drivers/net/wireless/marvell/mwifiex/main.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/marvell/mwifiex/main.c 
b/drivers/net/wireless/marvell/mwifiex/main.c
index a96bd7e653bf..3cc3403b977a 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -1567,6 +1567,10 @@ static void mwifiex_probe_of(struct mwifiex_adapter 
*adapter)
goto err_exit;
 
adapter->dt_node = dev->of_node;
+
+   if (adapter->iface_type != MWIFIEX_PCIE)
+   goto err_exit;
+
adapter->irq_wakeup = irq_of_parse_and_map(adapter->dt_node, 0);
if (!adapter->irq_wakeup) {
dev_dbg(dev, "fail to parse irq_wakeup from device tree\n");
-- 
2.11.0




[RFC PATCH v12 0/5] PCI: rockchip: Move PCIe WAKE# handling into pci core

2017-12-25 Thread Jeffy Chen

Currently we are handling the wake irq in the mrvl wifi driver. Move it
into the PCI core.

Tested on my chromebook bob (with cros 4.4 kernel and mrvl wifi).


Changes in v12:
Enable the wake irq in noirq stage to avoid possible irq storm.

Changes in v11:
Only add irq definitions for PCI devices and rewrite the commit message.
Address Brian's comments.
Only support 1-per-device PCIe WAKE# pin as suggested.
Move to pcie port as Brian suggested.

Changes in v10:
Use device_set_wakeup_capable() instead of device_set_wakeup_enable(),
since dedicated wakeirq will be lost in device_set_wakeup_enable(false).

Changes in v9:
Add section for PCI devices and rewrite the commit message.
Fix check error in .cleanup().
Move dedicated wakeirq setup to setup() callback and use
device_set_wakeup_enable() to enable/disable.
Rewrite the commit message.

Changes in v8:
Add optional "pci", and rewrite commit message.
Add pci-of.c and use platform_pm_ops to handle the PCIe WAKE# signal.
Rewrite the commit message.

Changes in v7:
Move PCIE_WAKE handling into pci core.

Changes in v6:
Fix device_init_wake error handling, and add some comments.

Changes in v5:
Move to pci.txt
Rebase.
Use "wakeup" instead of "wake"

Changes in v3:
Fix error handling.

Changes in v2:
Use dev_pm_set_dedicated_wake_irq.

Jeffy Chen (5):
  dt-bindings: PCI: Add definition of PCIe WAKE# irq and PCI irq
  of/irq: Adjust of_pci_irq parsing for multiple interrupts
  mwifiex: Disable wakeup irq handling for pcie
  PCI / PM: Add support for the PCIe WAKE# signal for OF
  arm64: dts: rockchip: Move PCIe WAKE# irq to pcie port for Gru

 Documentation/devicetree/bindings/pci/pci.txt | 10 
 arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi  | 11 ++--
 drivers/net/wireless/marvell/mwifiex/main.c   |  4 ++
 drivers/of/of_pci_irq.c   | 74 --
 drivers/pci/Makefile  |  1 +
 drivers/pci/pci-driver.c  | 10 
 drivers/pci/pci-of.c  | 75 +++
 include/linux/of_pci.h|  9 
 8 files changed, 186 insertions(+), 8 deletions(-)
 create mode 100644 drivers/pci/pci-of.c

-- 
2.11.0




[PATCH V2 net-next] selftests/net: fix bugs in address and port initialization

2017-12-25 Thread Sowmini Varadhan
Address/port initialization should work correctly regardless
of the order in which command line arguments are supplied;
e.g., cfg_port should be used to connect to the remote host
even if it is processed after -D, src/dst address initialization
should not require that [-4|-6] be specified before the -S or -D
args, and the receiver should be able to bind to *.

Achieve this by making sure that the address/port structures
are initialized after all command line options are parsed.

Store cfg_port in host-byte order, and use htons()
to set up the sin_port/sin6_port before bind/connect,
so that the network system calls get the correct values
in network-byte order.

Signed-off-by: Sowmini Varadhan 
---
v2: fix fragility in address initialization as well.

 tools/testing/selftests/net/msg_zerocopy.c |   21 +++--
 1 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/net/msg_zerocopy.c 
b/tools/testing/selftests/net/msg_zerocopy.c
index 3ab6ec4..e11fe84 100644
--- a/tools/testing/selftests/net/msg_zerocopy.c
+++ b/tools/testing/selftests/net/msg_zerocopy.c
@@ -259,22 +259,28 @@ static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t 
payload_len)
return sizeof(*ip6h);
 }
 
-static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr)
+
+static void setup_sockaddr(int domain, const char *str_addr,
+  struct sockaddr_storage *sockaddr)
 {
struct sockaddr_in6 *addr6 = (void *) sockaddr;
struct sockaddr_in *addr4 = (void *) sockaddr;
 
switch (domain) {
case PF_INET:
+   memset(addr4, 0, sizeof(*addr4));
addr4->sin_family = AF_INET;
addr4->sin_port = htons(cfg_port);
-   if (inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
+   if (str_addr &&
+   inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
error(1, 0, "ipv4 parse error: %s", str_addr);
break;
case PF_INET6:
+   memset(addr6, 0, sizeof(*addr6));
addr6->sin6_family = AF_INET6;
addr6->sin6_port = htons(cfg_port);
-   if (inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
+   if (str_addr &&
+   inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
error(1, 0, "ipv6 parse error: %s", str_addr);
break;
default:
@@ -603,6 +609,7 @@ static void parse_opts(int argc, char **argv)
sizeof(struct tcphdr) -
40 /* max tcp options */;
int c;
+   char *daddr = NULL, *saddr = NULL;
 
cfg_payload_len = max_payload_len;
 
@@ -627,7 +634,7 @@ static void parse_opts(int argc, char **argv)
cfg_cpu = strtol(optarg, NULL, 0);
break;
case 'D':
-   setup_sockaddr(cfg_family, optarg, &cfg_dst_addr);
+   daddr = optarg;
break;
case 'i':
cfg_ifindex = if_nametoindex(optarg);
@@ -638,7 +645,7 @@ static void parse_opts(int argc, char **argv)
cfg_cork_mixed = true;
break;
case 'p':
-   cfg_port = htons(strtoul(optarg, NULL, 0));
+   cfg_port = strtoul(optarg, NULL, 0);
break;
case 'r':
cfg_rx = true;
@@ -647,7 +654,7 @@ static void parse_opts(int argc, char **argv)
cfg_payload_len = strtoul(optarg, NULL, 0);
break;
case 'S':
-   setup_sockaddr(cfg_family, optarg, &cfg_src_addr);
+   saddr = optarg;
break;
case 't':
cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
@@ -660,6 +667,8 @@ static void parse_opts(int argc, char **argv)
break;
}
}
+   setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
+   setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
 
if (cfg_payload_len > max_payload_len)
error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
-- 
1.7.1



Re: [PATCH v5 iproute2 net-next] erspan: add erspan version II support

2017-12-25 Thread David Ahern
On 12/20/17 4:49 PM, William Tu wrote:
> The patch adds support for configuring the erspan v2, for both
> ipv4 and ipv6 erspan implementation.  Three additional fields
> are added: 'erspan_ver' for distinguishing v1 or v2, 'erspan_dir'
> for specifying direction of the mirrored traffic, and 'erspan_hwid'
> for users to set ERSPAN engine ID within a system.
> 
> As for manpage, the ERSPAN descriptions used to be under GRE, IPIP,
> SIT Type paragraph.  Since IP6GRE/IP6GRETAP also supports ERSPAN,
> the patch removes the old one, creates a separate ERSPAN paragraph,
> and adds an example.
> 
> Signed-off-by: William Tu 
> ---
> change in v5:
>   - update the "Usage: " in c file, so
> # ip link help erspan
> shows the new options

Since v4 was already applied to the new iproute2 net-next tree, please
send an update against it. Thanks,


Re: [PATCH v4 iproute2 net-next] erspan: add erspan version II support

2017-12-25 Thread David Ahern
On 12/19/17 8:01 PM, William Tu wrote:
> The patch adds support for configuring the erspan v2, for both
> ipv4 and ipv6 erspan implementation.  Three additional fields
> are added: 'erspan_ver' for distinguishing v1 or v2, 'erspan_dir'
> for specifying direction of the mirrored traffic, and 'erspan_hwid'
> for users to set ERSPAN engine ID within a system.
> 
> As for manpage, the ERSPAN descriptions used to be under GRE, IPIP,
> SIT Type paragraph.  Since IP6GRE/IP6GRETAP also supports ERSPAN,
> the patch removes the old one, creates a separate ERSPAN paragraph,
> and adds an example.
> 
> Signed-off-by: William Tu 
> ---
> change in v4:
>   - use matches instead of strcmp on ingress/egress
> change in v3:
>   - change erspan_dir 0/1 to "in[gress]/e[gress]"
>   - update manpage
> change in v2:
>   - fix typo ETH_P_ERSPAN2
>   - fix space and indent
> ---
>  include/uapi/linux/if_ether.h  |  1 +
>  include/uapi/linux/if_tunnel.h |  3 ++
>  ip/link_gre.c  | 66 --
>  ip/link_gre6.c | 67 --
>  man/man8/ip-link.8.in  | 92 
> --
>  5 files changed, 210 insertions(+), 19 deletions(-)

This was applied to iproute2 net-next:

https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/



Re: [PATCH net-next] selftests/net: fix bugs in cfg_port initialization

2017-12-25 Thread Sowmini Varadhan
On (12/25/17 15:49), Willem de Bruijn wrote:
> 
> It would be good to also address the other instance of this at the
> same time: protocol family (-4 or -6) has to be set before a call to
> setup_sockaddr. Unlike this case, it hits an error() if called in the
> "incorrect" order, but that is still a crutch.

Yeah, I thought of that one too, but "usually" (at least
that's what is instinctive to me) you bind to INADDR_ANY
or ::, so this only becomes an issue on the client side. 
(And there, most people put the -4 or -6 before the address,
but I agree it would be nice to fix this..)

> But even better will be to save cfg_port, and src + dst addr optargs
> as local variables, then call the init function only after parsing when
> all state is available. Where this patch adds calls to
> init_sockaddr_port, indeed.

sure, I can put that out on V2 later this week.




[PATCH bpf-next 3/3] libbpf: break loop earlier

2017-12-25 Thread Eric Leblond
Get out of the loop when we have a match.

Signed-off-by: Eric Leblond 
---
 tools/lib/bpf/libbpf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5fe8aaa2123e..d263748aa341 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -412,6 +412,7 @@ bpf_object__init_prog_names(struct bpf_object *obj)
   prog->section_name);
return -LIBBPF_ERRNO__LIBELF;
}
+   break;
}
 
if (!name) {
-- 
2.15.1



[PATCH bpf-next 0/3] add XDP loading support to libbpf

2017-12-25 Thread Eric Leblond

Hello,

This patchset adds a function to load an XDP eBPF program via the
libbpf library. It gets the netlink extended ack from the driver in
case of failure and prints the error to stderr.
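
For reference, a minimal usage sketch of the new helper from patch
1/3 (error handling trimmed; the header holding the prototype and the
choice of XDP_FLAGS_SKB_MODE are assumptions, see the patches for the
actual declaration):

	#include <stdio.h>
	#include <net/if.h>          /* if_nametoindex() */
	#include <linux/if_link.h>   /* XDP_FLAGS_SKB_MODE */
	#include "libbpf.h"          /* assumed location of the prototype */

	/* attach an already-loaded XDP program (prog_fd) to eth0 */
	static int attach_xdp(int prog_fd)
	{
		int ifindex = if_nametoindex("eth0");

		if (!ifindex) {
			fprintf(stderr, "unknown interface\n");
			return -1;
		}
		return bpf_set_link_xdp_fd(ifindex, prog_fd, XDP_FLAGS_SKB_MODE);
	}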

Best regards,
--
Eric Leblond


[PATCH bpf-next 2/3] libbpf: add error reporting in XDP

2017-12-25 Thread Eric Leblond
Parse netlink ext attribute to get the error message returned by
the card.

Signed-off-by: Eric Leblond 
---
 tools/lib/bpf/Build|   2 +-
 tools/lib/bpf/bpf.c|   9 +++
 tools/lib/bpf/nlattr.c | 188 +
 tools/lib/bpf/nlattr.h | 164 ++
 4 files changed, 362 insertions(+), 1 deletion(-)
 create mode 100644 tools/lib/bpf/nlattr.c
 create mode 100644 tools/lib/bpf/nlattr.h

diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index d8749756352d..64c679d67109 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1 +1 @@
-libbpf-y := libbpf.o bpf.o
+libbpf-y := libbpf.o bpf.o nlattr.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 1e3cfe6b9fce..cfd30a0cbce4 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -26,6 +26,7 @@
 #include 
 #include "bpf.h"
 #include "libbpf.h"
+#include "nlattr.h"
 #include 
 #include 
 #include 
@@ -436,6 +437,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
struct nlmsghdr *nh;
struct nlmsgerr *err;
socklen_t addrlen;
+   int one;
 
	memset(&sa, 0, sizeof(sa));
sa.nl_family = AF_NETLINK;
@@ -445,6 +447,12 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
return -errno;
}
 
+   if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
+  &one, sizeof(one)) < 0) {
+   /* debug/verbose message that it is not supported */
+   fprintf(stderr, "Netlink error reporting not supported\n"); 
+   }
+
	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
ret = -errno;
goto cleanup;
@@ -521,6 +529,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
if (!err->error)
continue;
ret = err->error;
+   nla_dump_errormsg(nh);
goto cleanup;
case NLMSG_DONE:
break;
diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
new file mode 100644
index ..962de14f74e3
--- /dev/null
+++ b/tools/lib/bpf/nlattr.c
@@ -0,0 +1,188 @@
+
+/*
+ * NETLINK  Netlink attributes
+ *
+ * Authors:Thomas Graf 
+ * Alexey Kuznetsov 
+ */
+
+#include 
+#include "nlattr.h"
+#include 
+#include 
+#include 
+
+static const __u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
+   [NLA_U8]= sizeof(__u8),
+   [NLA_U16]   = sizeof(__u16),
+   [NLA_U32]   = sizeof(__u32),
+   [NLA_U64]   = sizeof(__u64),
+   [NLA_MSECS] = sizeof(__u64),
+   [NLA_NESTED]= NLA_HDRLEN,
+   [NLA_S8]= sizeof(__s8),
+   [NLA_S16]   = sizeof(__s16),
+   [NLA_S32]   = sizeof(__s32),
+   [NLA_S64]   = sizeof(__s64),
+};
+
+static int validate_nla(const struct nlattr *nla, int maxtype,
+   const struct nla_policy *policy)
+{
+   const struct nla_policy *pt;
+   int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
+
+   if (type <= 0 || type > maxtype)
+   return 0;
+
+   pt = &policy[type];
+
+   if (pt->type > NLA_TYPE_MAX)
+   return -EINVAL;
+
+   switch (pt->type) {
+   case NLA_FLAG:
+   if (attrlen > 0)
+   return -ERANGE;
+   break;
+
+   case NLA_NUL_STRING:
+   if (pt->len)
+   minlen = min(attrlen, pt->len + 1);
+   else
+   minlen = attrlen;
+
+   if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL)
+   return -EINVAL;
+   /* fall through */
+
+   case NLA_STRING:
+   if (attrlen < 1)
+   return -ERANGE;
+
+   if (pt->len) {
+   char *buf = nla_data(nla);
+
+   if (buf[attrlen - 1] == '\0')
+   attrlen--;
+
+   if (attrlen > pt->len)
+   return -ERANGE;
+   }
+   break;
+
+   case NLA_BINARY:
+   if (pt->len && attrlen > pt->len)
+   return -ERANGE;
+   break;
+
+   case NLA_NESTED_COMPAT:
+   if (attrlen < pt->len)
+   return -ERANGE;
+   if (attrlen < NLA_ALIGN(pt->len))
+   break;
+   if (attrlen < NLA_ALIGN(pt->len) + NLA_HDRLEN)
+   return -ERANGE;
+   nla = nla_data(nla) + NLA_ALIGN(pt->len);
+   if (attrlen < NLA_ALIGN(pt->len) + NLA_HDRLEN + nla_len(nla))
+   return -ERANGE;
+   break;
+   case NLA_NESTED:
+   /* a nested attributes is allowed to be 

[PATCH bpf-next 1/3] libbpf: add function to setup XDP

2017-12-25 Thread Eric Leblond
Most of the code is taken from set_link_xdp_fd() in bpf_load.c and
slightly modified to be library compliant.

Signed-off-by: Eric Leblond 
---
 tools/lib/bpf/bpf.c| 126 -
 tools/lib/bpf/libbpf.c |   2 +
 tools/lib/bpf/libbpf.h |   4 ++
 3 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 5128677e4117..1e3cfe6b9fce 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -25,6 +25,16 @@
 #include 
 #include 
 #include "bpf.h"
+#include "libbpf.h"
+#include 
+#include 
+#include 
+
+#ifndef IFLA_XDP_MAX
+#define IFLA_XDP   43
+#define IFLA_XDP_FD1
+#define IFLA_XDP_FLAGS 3
+#endif
 
 /*
  * When building perf, unistd.h is overridden. __NR_bpf is
@@ -46,8 +56,6 @@
 # endif
 #endif
 
-#define min(x, y) ((x) < (y) ? (x) : (y))
-
 static inline __u64 ptr_to_u64(const void *ptr)
 {
return (__u64) (unsigned long) ptr;
@@ -413,3 +421,117 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 
*info_len)
 
return err;
 }
+
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
+{
+   struct sockaddr_nl sa;
+   int sock, seq = 0, len, ret = -1;
+   char buf[4096];
+   struct nlattr *nla, *nla_xdp;
+   struct {
+   struct nlmsghdr  nh;
+   struct ifinfomsg ifinfo;
+   char attrbuf[64];
+   } req;
+   struct nlmsghdr *nh;
+   struct nlmsgerr *err;
+   socklen_t addrlen;
+
+   memset(&sa, 0, sizeof(sa));
+   sa.nl_family = AF_NETLINK;
+
+   sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+   if (sock < 0) {
+   return -errno;
+   }
+
+   if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+   ret = -errno;
+   goto cleanup;
+   }
+
+   addrlen = sizeof(sa);
+   if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
+   ret = errno;
+   goto cleanup;
+   }
+
+   if (addrlen != sizeof(sa)) {
+   ret = errno;
+   goto cleanup;
+   }
+
+   memset(&req, 0, sizeof(req));
+   req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+   req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+   req.nh.nlmsg_type = RTM_SETLINK;
+   req.nh.nlmsg_pid = 0;
+   req.nh.nlmsg_seq = ++seq;
+   req.ifinfo.ifi_family = AF_UNSPEC;
+   req.ifinfo.ifi_index = ifindex;
+
+   /* started nested attribute for XDP */
+   nla = (struct nlattr *)(((char *)&req)
+   + NLMSG_ALIGN(req.nh.nlmsg_len));
+   nla->nla_type = NLA_F_NESTED | IFLA_XDP;
+   nla->nla_len = NLA_HDRLEN;
+
+   /* add XDP fd */
+   nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+   nla_xdp->nla_type = IFLA_XDP_FD;
+   nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
+   memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
+   nla->nla_len += nla_xdp->nla_len;
+
+   /* if user passed in any flags, add those too */
+   if (flags) {
+   nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+   nla_xdp->nla_type = IFLA_XDP_FLAGS;
+   nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
+   memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
+   nla->nla_len += nla_xdp->nla_len;
+   }
+
+   req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
+
+   if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+   ret = -errno;
+   goto cleanup;
+   }
+
+   len = recv(sock, buf, sizeof(buf), 0);
+   if (len < 0) {
+   ret = -errno;
+   goto cleanup;
+   }
+
+   for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+nh = NLMSG_NEXT(nh, len)) {
+   if (nh->nlmsg_pid != sa.nl_pid) {
+   ret = -LIBBPF_ERRNO__WRNGPID;
+   goto cleanup;
+   }
+   if (nh->nlmsg_seq != seq) {
+   ret = -LIBBPF_ERRNO__INVSEQ;
+   goto cleanup;
+   }
+   switch (nh->nlmsg_type) {
+   case NLMSG_ERROR:
+   err = (struct nlmsgerr *)NLMSG_DATA(nh);
+   if (!err->error)
+   continue;
+   ret = err->error;
+   goto cleanup;
+   case NLMSG_DONE:
+   break;
+   default:
+   break;
+   }
+   }
+
+   ret = 0;
+
+cleanup:
+   close(sock);
+   return ret;
+}
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e9c4b7cabcf2..5fe8aaa2123e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -106,6 +106,8 @@ static const char *libbpf_strerror_table[NR_ERRNO] = {
[ERRCODE_OFFSET(PROG2BIG)]  = "Program too big",

[PATCH v2 bpf-next 3/3] bpf: fix max call depth check

2017-12-25 Thread Alexei Starovoitov
Fix an off-by-one error in the max call depth check
and add a test.

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Signed-off-by: Alexei Starovoitov 
---
 kernel/bpf/verifier.c   |  4 ++--
 tools/testing/selftests/bpf/test_verifier.c | 35 +
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 738e919efdf0..52ad60b3b8be 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2126,9 +2126,9 @@ static int check_func_call(struct bpf_verifier_env *env, 
struct bpf_insn *insn,
struct bpf_func_state *caller, *callee;
int i, subprog, target_insn;
 
-   if (state->curframe >= MAX_CALL_FRAMES) {
+   if (state->curframe + 1 >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep\n",
-   state->curframe);
+   state->curframe + 2);
return -E2BIG;
}
 
diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index b5a7a6c530dc..5d0a574ce270 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -8885,6 +8885,41 @@ static struct bpf_test tests[] = {
.errstr = "combined stack",
},
{
+   "calls: stack depth check using three frames. test5",
+   .insns = {
+   /* main */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call A */
+   BPF_EXIT_INSN(),
+   /* A */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call B */
+   BPF_EXIT_INSN(),
+   /* B */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call C */
+   BPF_EXIT_INSN(),
+   /* C */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call D */
+   BPF_EXIT_INSN(),
+   /* D */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call E */
+   BPF_EXIT_INSN(),
+   /* E */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call F */
+   BPF_EXIT_INSN(),
+   /* F */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call G */
+   BPF_EXIT_INSN(),
+   /* G */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */
+   BPF_EXIT_INSN(),
+   /* H */
+   BPF_MOV64_IMM(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   },
+   .prog_type = BPF_PROG_TYPE_XDP,
+   .errstr = "call stack",
+   .result = REJECT,
+   },
+   {
"calls: spill into caller stack frame",
.insns = {
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
-- 
2.9.5



[PATCH v2 bpf-next 0/3] bpf: two stack check fixes

2017-12-25 Thread Alexei Starovoitov
Jann reported an issue with stack depth tracking. Fix it and add tests.
Also fix an off-by-one error in the MAX_CALL_FRAMES check.
This set is on top of Jann's "selftest for late caller stack size increase" 
test.

Alexei Starovoitov (3):
  bpf: fix maximum stack depth tracking logic
  selftests/bpf: additional stack depth tests
  bpf: fix max call depth check

 include/linux/bpf_verifier.h|   1 +
 kernel/bpf/verifier.c   |  86 +++
 tools/testing/selftests/bpf/test_verifier.c | 156 
 3 files changed, 225 insertions(+), 18 deletions(-)

-- 
2.9.5



[PATCH v2 bpf-next 1/3] bpf: fix maximum stack depth tracking logic

2017-12-25 Thread Alexei Starovoitov
Instead of computing the max stack depth for the current call chain
during the main verifier pass, track the stack depth of each
function independently and, after do_check() is done, do
another pass over all instructions analyzing the depth
of all possible call stacks.

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Reported-by: Jann Horn 
Signed-off-by: Alexei Starovoitov 
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c| 82 +++-
 2 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index aaac589e490c..94a02ceb1246 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -194,6 +194,7 @@ struct bpf_verifier_env {
struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
struct bpf_verifer_log log;
u32 subprog_starts[BPF_MAX_SUBPROGS];
+   /* computes the stack depth of each bpf function */
u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
u32 subprog_cnt;
 };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 82ae580440b8..738e919efdf0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1430,33 +1430,80 @@ static int update_stack_depth(struct bpf_verifier_env 
*env,
  const struct bpf_func_state *func,
  int off)
 {
-   u16 stack = env->subprog_stack_depth[func->subprogno], total = 0;
-   struct bpf_verifier_state *cur = env->cur_state;
-   int i;
+   u16 stack = env->subprog_stack_depth[func->subprogno];
 
if (stack >= -off)
return 0;
 
/* update known max for given subprogram */
env->subprog_stack_depth[func->subprogno] = -off;
+   return 0;
+}
 
-   /* compute the total for current call chain */
-   for (i = 0; i <= cur->curframe; i++) {
-   u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno];
-
-   /* round up to 32-bytes, since this is granularity
-* of interpreter stack sizes
-*/
-   depth = round_up(depth, 32);
-   total += depth;
-   }
+/* starting from main bpf function walk all instructions of the function
+ * and recursively walk all callees that given function can call.
+ * Ignore jump and exit insns.
+ * Since recursion is prevented by check_cfg() this algorithm
+ * only needs a local stack of MAX_CALL_FRAMES to remember callsites
+ */
+static int check_max_stack_depth(struct bpf_verifier_env *env)
+{
+   int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
+   struct bpf_insn *insn = env->prog->insnsi;
+   int insn_cnt = env->prog->len;
+   int ret_insn[MAX_CALL_FRAMES];
+   int ret_prog[MAX_CALL_FRAMES];
 
-   if (total > MAX_BPF_STACK) {
+process_func:
+   /* round up to 32-bytes, since this is granularity
+* of interpreter stack size
+*/
+   depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+   if (depth > MAX_BPF_STACK) {
verbose(env, "combined stack size of %d calls is %d. Too 
large\n",
-   cur->curframe, total);
+   frame + 1, depth);
return -EACCES;
}
-   return 0;
+continue_func:
+   if (env->subprog_cnt == subprog)
+   subprog_end = insn_cnt;
+   else
+   subprog_end = env->subprog_starts[subprog];
+   for (; i < subprog_end; i++) {
+   if (insn[i].code != (BPF_JMP | BPF_CALL))
+   continue;
+   if (insn[i].src_reg != BPF_PSEUDO_CALL)
+   continue;
+   /* remember insn and function to return to */
+   ret_insn[frame] = i + 1;
+   ret_prog[frame] = subprog;
+
+   /* find the callee */
+   i = i + insn[i].imm + 1;
+   subprog = find_subprog(env, i);
+   if (subprog < 0) {
+   WARN_ONCE(1, "verifier bug. No program starts at insn 
%d\n",
+ i);
+   return -EFAULT;
+   }
+   subprog++;
+   frame++;
+   if (frame >= MAX_CALL_FRAMES) {
+   WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
+   return -EFAULT;
+   }
+   goto process_func;
+   }
+   /* end of for() loop means the last insn of the 'subprog'
+* was reached. Doesn't matter whether it was JA or EXIT
+*/
+   if (frame == 0)
+   return 0;
+   depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+   frame--;
+   i = ret_insn[frame];
+   subprog = ret_prog[frame];
+   goto continue_func;
 }
 
 static int get_callee_stack_depth(struct 

[PATCH v2 bpf-next 2/3] selftests/bpf: additional stack depth tests

2017-12-25 Thread Alexei Starovoitov
to test inner logic of stack depth tracking

Signed-off-by: Alexei Starovoitov 
---
 tools/testing/selftests/bpf/test_verifier.c | 121 
 1 file changed, 121 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 41dcc7dbba42..b5a7a6c530dc 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -8764,6 +8764,127 @@ static struct bpf_test tests[] = {
.result = REJECT,
},
{
+   "calls: stack depth check using three frames. test1",
+   .insns = {
+   /* main */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4), /* call A */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 5), /* call B */
+   BPF_ST_MEM(BPF_B, BPF_REG_10, -32, 0),
+   BPF_MOV64_IMM(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   /* A */
+   BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0),
+   BPF_EXIT_INSN(),
+   /* B */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -3), /* call A */
+   BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0),
+   BPF_EXIT_INSN(),
+   },
+   .prog_type = BPF_PROG_TYPE_XDP,
+   /* stack_main=32, stack_A=256, stack_B=64
+* and max(main+A, main+A+B) < 512
+*/
+   .result = ACCEPT,
+   },
+   {
+   "calls: stack depth check using three frames. test2",
+   .insns = {
+   /* main */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4), /* call A */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 5), /* call B */
+   BPF_ST_MEM(BPF_B, BPF_REG_10, -32, 0),
+   BPF_MOV64_IMM(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   /* A */
+   BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0),
+   BPF_EXIT_INSN(),
+   /* B */
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -3), /* call A */
+   BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0),
+   BPF_EXIT_INSN(),
+   },
+   .prog_type = BPF_PROG_TYPE_XDP,
+   /* stack_main=32, stack_A=64, stack_B=256
+* and max(main+A, main+A+B) < 512
+*/
+   .result = ACCEPT,
+   },
+   {
+   "calls: stack depth check using three frames. test3",
+   .insns = {
+   /* main */
+   BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 6), /* call A */
+   BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 8), /* call B */
+   BPF_JMP_IMM(BPF_JGE, BPF_REG_6, 0, 1),
+   BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0),
+   BPF_MOV64_IMM(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   /* A */
+   BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 10, 1),
+   BPF_EXIT_INSN(),
+   BPF_ST_MEM(BPF_B, BPF_REG_10, -224, 0),
+   BPF_JMP_IMM(BPF_JA, 0, 0, -3),
+   /* B */
+   BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 2, 1),
+   BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -6), /* call A */
+   BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0),
+   BPF_EXIT_INSN(),
+   },
+   .prog_type = BPF_PROG_TYPE_XDP,
+   /* stack_main=64, stack_A=224, stack_B=256
+* and max(main+A, main+A+B) > 512
+*/
+   .errstr = "combined stack",
+   .result = REJECT,
+   },
+   {
+   "calls: stack depth check using three frames. test4",
+   /* void main(void) {
+*   func1(0);
+*   func1(1);
+*   func2(1);
+* }
+* void func1(int alloc_or_recurse) {
+*   if (alloc_or_recurse) {
+* frame_pointer[-300] = 1;
+*   } else {
+* func2(alloc_or_recurse);
+*   }
+* }
+* void func2(int alloc_or_recurse) {
+*   if (alloc_or_recurse) {
+* frame_pointer[-300] = 1;
+*   }
+* }
+*/
+   .insns = {
+   /* main */
+   BPF_MOV64_IMM(BPF_REG_1, 0),
+

Re: [PATCH net-next] selftests/net: fix bugs in cfg_port initialization

2017-12-25 Thread Willem de Bruijn
On Sun, Dec 24, 2017 at 12:23 PM, Sowmini Varadhan
 wrote:
> If -S is not used in the command line, we should
> be binding to *.. Similarly, cfg_port should be
> used to connect to the remote host even if it is processed
> after -D. Thus we need to make sure that the cfg_port in
> cfg_src_addr and cfg_dst_addr are always initialized
> after all other command line options are parsed.

Thanks, Sowmini. Yes, input parsing as it stands is order-dependent,
which is surprising and fragile.

It would be good to also address the other instance of this at the
same time: protocol family (-4 or -6) has to be set before a call to
setup_sockaddr. Unlike this case, it hits an error() if called in the
"incorrect" order, but that is still a crutch.

The new init_sockaddr_port duplicates most of setup_sockaddr.
More concise would be to add a branch there that skips inet_pton if !str_addr.

But even better would be to save cfg_port and the src + dst addr optargs
as local variables, then call the init function only after parsing, when
all state is available. Right where this patch adds calls to
init_sockaddr_port, indeed.
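
Something along these lines, as a rough sketch only: the setup_sockaddr()
variant taking the family and port explicitly is assumed here, and the
option letters and defaults are purely illustrative.

#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

static int cfg_family = AF_INET;
static uint16_t cfg_port = 8000;
static struct sockaddr_storage cfg_src_addr, cfg_dst_addr;

/* assumed variant of the existing helper; NULL str_addr means wildcard */
void setup_sockaddr(int family, const char *str_addr, uint16_t port,
		    struct sockaddr_storage *ss);

static void parse_opts(int argc, char **argv)
{
	const char *src = NULL, *dst = NULL;
	int c;

	while ((c = getopt(argc, argv, "46S:D:p:")) != -1) {
		switch (c) {
		case '4': cfg_family = AF_INET;  break;
		case '6': cfg_family = AF_INET6; break;
		case 'S': src = optarg;          break;
		case 'D': dst = optarg;          break;
		case 'p': cfg_port = strtoul(optarg, NULL, 0); break;
		}
	}

	/* all options parsed: -4/-6/-S/-D/-p ordering no longer matters */
	setup_sockaddr(cfg_family, src, cfg_port, &cfg_src_addr);
	setup_sockaddr(cfg_family, dst, cfg_port, &cfg_dst_addr);
}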


iproute2 net-next

2017-12-25 Thread Stephen Hemminger
David Ahern has agreed to take over managing the net-next branch of iproute2.
The new location is:
 https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/

In the past, I have accepted new features into iproute2 master branch, but
am changing the policy so that outside of the merge window (up until -rc1)
new features will get put into net-next to get some more review and testing
time. This means that things like the proposed batch streaming mode will
go through net-next.

Thanks
Stephen Hemminger




Fw: [Bug 198253] New: stopping htb results in null pointer dereference

2017-12-25 Thread Stephen Hemminger


Begin forwarded message:

Date: Mon, 25 Dec 2017 08:25:44 +
From: bugzilla-dae...@bugzilla.kernel.org
To: step...@networkplumber.org
Subject: [Bug 198253] New: stopping htb results in null pointer dereference


https://bugzilla.kernel.org/show_bug.cgi?id=198253

Bug ID: 198253
   Summary: stopping htb results in null pointer dereference
   Product: Networking
   Version: 2.5
Kernel Version: 4.15
  Hardware: All
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: low
  Priority: P1
 Component: Other
  Assignee: step...@networkplumber.org
  Reporter: lorinc.czegl...@gmail.com
Regression: No

Created attachment 261323
  --> https://bugzilla.kernel.org/attachment.cgi?id=261323=edit  
BUG: unable to handle kernel NULL pointer dereference at 0018

the latest kernel linux415-4.15.r17121 as built by the manjaro team, using the
fireqos traffic shaping frontend with the attached config, produces BUG: unable to
handle kernel NULL pointer dereference at 0018

-- 
You are receiving this mail because:
You are the assignee for the bug.


Re: [PATCH net-next] virtio_net: Add ethtool stats

2017-12-25 Thread Florian Fainelli
On December 25, 2017 3:45:36 AM PST, Toshiaki Makita 
 wrote:
>On 2017/12/25 3:16, Stephen Hemminger wrote:
>> On Wed, 20 Dec 2017 13:40:37 +0900
>> Toshiaki Makita  wrote:
>> 
>>> +
>>> +static const struct virtnet_gstats virtnet_gstrings_stats[] = {
>>> +   { "rx_packets", VIRTNET_NETDEV_STAT(rx_packets) },
>>> +   { "tx_packets", VIRTNET_NETDEV_STAT(tx_packets) },
>>> +   { "rx_bytes",   VIRTNET_NETDEV_STAT(rx_bytes) },
>>> +   { "tx_bytes",   VIRTNET_NETDEV_STAT(tx_bytes) },
>>> +   { "rx_dropped", VIRTNET_NETDEV_STAT(rx_dropped) },
>>> +   { "rx_length_errors",   VIRTNET_NETDEV_STAT(rx_length_errors) },
>>> +   { "rx_frame_errors",VIRTNET_NETDEV_STAT(rx_frame_errors) },
>>> +   { "tx_dropped", VIRTNET_NETDEV_STAT(tx_dropped) },
>>> +   { "tx_fifo_errors", VIRTNET_NETDEV_STAT(tx_fifo_errors) },
>>> +};
>>> +
>> 
>> Please do not merge pre-existing global stats into ethtool.
>> It just duplicates existing functionality.
>
>Thanks for the feedback.
>I thought it's handy to be able to check stats like this in ethtool as
>well. Some drivers like ixgbe and mlx5 do similar thing.
>I can remove these duplicate stats (unless anyone has objection).

For existing drivers, ethtool stats are kind of user ABI, so once these stats 
get in, we should not consider removing them. For new drivers or existing 
drivers without ethtool stats, we can do things a bit better as Stephen 
suggested.

-- 
Florian


Re: [net 02/14] Revert "mlx5: move affinity hints assignments to generic code"

2017-12-25 Thread Sagi Grimberg



Before the offending commit, mlx5 core did the IRQ affinity itself,
and it seems that the new generic code have some drawbacks and one
of them is the lack for user ability to modify irq affinity after
the initial affinity values got assigned.

The issue is still being discussed and a solution in the new generic code
is required, until then we need to revert this patch.

This fixes the following issue:
echo  > /proc/irq//smp_affinity
fails with  -EIO

This reverts commit a435393acafbf0ecff4deb3e3cb554b34f0d0664.
Note: kept mlx5_get_vector_affinity in include/linux/mlx5/driver.h since

> it is used in mlx5_ib driver.

This won't work for sure because the msi_desc affinity cpumask won't
ever be populated. You need to re-implement it in mlx5 if you don't want
to break rdma ULPs that rely on it.


Re: [PATCH] flex_can: Fix checking can_dlc

2017-12-25 Thread Oliver Hartkopp

Answering myself after reading my own comment once more:

In fact the code fix seems to be correct, but the commit comment was 
completely wrong, which led to my answer ...


can_dlc can hold values from 0 .. 8.

The first 4 bytes are placed in data[0..3]. When we have more(!) than 4 
bytes in can_dlc, the bytes 5..8 are placed in data[4..7].


The good thing is that the current check was more conservative than 
your suggestion, so we did not see any data loss.
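
To spell out the layout, a minimal sketch (not the flexcan code itself;
sketch_frame and write_word() are stand-ins for the real frame struct and
register access):

#include <stdint.h>
#include <string.h>

struct sketch_frame {
	uint8_t can_dlc;	/* payload length, 0..8 */
	uint8_t data[8];
};

static void xmit_words(const struct sketch_frame *cf,
		       void (*write_word)(uint32_t val, int reg))
{
	uint32_t w;

	if (cf->can_dlc > 0) {		/* payload bytes 1..4 live in data[0..3] */
		memcpy(&w, &cf->data[0], 4);
		write_word(w, 0);
	}
	if (cf->can_dlc > 4) {		/* payload bytes 5..8 live in data[4..7] */
		memcpy(&w, &cf->data[4], 4);
		write_word(w, 1);
	}
	/* checking "> 3" instead only adds a harmless extra register write
	 * for can_dlc == 4; neither variant can drop payload bytes */
}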


Please send a V2 patch with corrected commit message.

Thanks,
Oliver

ps.


From: "phu.luuan" 
+ * Copyright 2017 NXP


A one-liner contribution like this tiny fix usually does not lead to an 
attribution in the copyrights. Your contribution is already perfectly 
recorded by the Signed-off-by tag.


[RFC PATCH v11 3/5] mwifiex: Disable wakeup irq handling for pcie

2017-12-25 Thread Jeffy Chen
We are going to handle the wakeup irq in the pci core.

Signed-off-by: Jeffy Chen 
---

Changes in v11: None
Changes in v10: None
Changes in v9: None
Changes in v8: None
Changes in v7: None
Changes in v6: None
Changes in v5: None
Changes in v3: None
Changes in v2: None

 drivers/net/wireless/marvell/mwifiex/main.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index a96bd7e653bf..3cc3403b977a 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -1567,6 +1567,10 @@ static void mwifiex_probe_of(struct mwifiex_adapter *adapter)
goto err_exit;
 
adapter->dt_node = dev->of_node;
+
+   if (adapter->iface_type != MWIFIEX_PCIE)
+   goto err_exit;
+
adapter->irq_wakeup = irq_of_parse_and_map(adapter->dt_node, 0);
if (!adapter->irq_wakeup) {
dev_dbg(dev, "fail to parse irq_wakeup from device tree\n");
-- 
2.11.0




[RFC PATCH v11 0/5] PCI: rockchip: Move PCIe WAKE# handling into pci core

2017-12-25 Thread Jeffy Chen

Currently we are handling the wakeup irq in the mrvl wifi driver. Move it
into the PCI core.

Tested on my chromebook bob (with cros 4.4 kernel and mrvl wifi).


Changes in v11:
Only add irq definitions for PCI devices and rewrite the commit message.
Address Brian's comments.
Only support 1-per-device PCIe WAKE# pin as suggested.
Move to pcie port as Brian suggested.

Changes in v10:
Use device_set_wakeup_capable() instead of device_set_wakeup_enable(),
since dedicated wakeirq will be lost in device_set_wakeup_enable(false).

Changes in v9:
Add section for PCI devices and rewrite the commit message.
Fix check error in .cleanup().
Move dedicated wakeirq setup to setup() callback and use
device_set_wakeup_enable() to enable/disable.
Rewrite the commit message.

Changes in v8:
Add optional "pci", and rewrite commit message.
Add pci-of.c and use platform_pm_ops to handle the PCIe WAKE# signal.
Rewrite the commit message.

Changes in v7:
Move PCIE_WAKE handling into pci core.

Changes in v6:
Fix device_init_wake error handling, and add some comments.

Changes in v5:
Move to pci.txt
Rebase.
Use "wakeup" instead of "wake"

Changes in v3:
Fix error handling.

Changes in v2:
Use dev_pm_set_dedicated_wake_irq.

Jeffy Chen (5):
  dt-bindings: PCI: Add definition of PCIe WAKE# irq and PCI irq
  of/irq: Adjust of_pci_irq parsing for multiple interrupts
  mwifiex: Disable wakeup irq handling for pcie
  PCI / PM: Add support for the PCIe WAKE# signal for OF
  arm64: dts: rockchip: Move PCIe WAKE# irq to pcie port for Gru

 Documentation/devicetree/bindings/pci/pci.txt | 10 
 arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi  | 11 +++--
 drivers/net/wireless/marvell/mwifiex/main.c   |  4 ++
 drivers/of/of_pci_irq.c   | 71 +--
 drivers/pci/pci-driver.c  | 10 
 include/linux/of_pci.h|  9 
 6 files changed, 107 insertions(+), 8 deletions(-)

-- 
2.11.0




Re: [PATCH net-next] virtio_net: Add ethtool stats

2017-12-25 Thread Toshiaki Makita
On 2017/12/25 3:16, Stephen Hemminger wrote:
> On Wed, 20 Dec 2017 13:40:37 +0900
> Toshiaki Makita  wrote:
> 
>> +
>> +static const struct virtnet_gstats virtnet_gstrings_stats[] = {
>> +{ "rx_packets", VIRTNET_NETDEV_STAT(rx_packets) },
>> +{ "tx_packets", VIRTNET_NETDEV_STAT(tx_packets) },
>> +{ "rx_bytes",   VIRTNET_NETDEV_STAT(rx_bytes) },
>> +{ "tx_bytes",   VIRTNET_NETDEV_STAT(tx_bytes) },
>> +{ "rx_dropped", VIRTNET_NETDEV_STAT(rx_dropped) },
>> +{ "rx_length_errors",   VIRTNET_NETDEV_STAT(rx_length_errors) },
>> +{ "rx_frame_errors",VIRTNET_NETDEV_STAT(rx_frame_errors) },
>> +{ "tx_dropped", VIRTNET_NETDEV_STAT(tx_dropped) },
>> +{ "tx_fifo_errors", VIRTNET_NETDEV_STAT(tx_fifo_errors) },
>> +};
>> +
> 
> Please do not merge pre-existing global stats into ethtool.
> It just duplicates existing functionality.

Thanks for the feedback.
I thought it's handy to be able to check stats like this in ethtool as
well. Some drivers like ixgbe and mlx5 do similar thing.
I can remove these duplicate stats (unless anyone has objection).

-- 
Toshiaki Makita



Re: [PATCH] flex_can: Fix checking can_dlc

2017-12-25 Thread Oliver Hartkopp

This patch looks wrong to me.

On 12/19/2017 09:40 AM, Luu An Phu wrote:

From: "phu.luuan" 

flexcan_start_xmit should write data to register when can_dlc > 4.
Because can_dlc is data length and it has value from 1 to 8.


No. can_dlc can contain values from 0 to 8. Even 0 is a valid DLC.


But CAN data
index has value from 0 to 7. So in case we have data in cf->data[4],
the can_dlc has value is more than 4.

Signed-off-by: Luu An Phu 
---
  drivers/net/can/flexcan.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index 0626dcf..0e3ff13 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -5,6 +5,7 @@
   * Copyright (c) 2009 Sascha Hauer, Pengutronix
   * Copyright (c) 2010-2017 Pengutronix, Marc Kleine-Budde 

   * Copyright (c) 2014 David Jander, Protonic Holland
+ * Copyright 2017 NXP
   *
   * Based on code originally by Andrey Volkov 
   *
@@ -526,7 +527,7 @@ static int flexcan_start_xmit(struct sk_buff *skb, struct net_device *dev)
data = be32_to_cpup((__be32 *)>data[0]);
flexcan_write(data, >tx_mb->data[0]);
}
-   if (cf->can_dlc > 3) {


This is correct as data[0 .. 3] are filled from the code above. And if 
can_dlc is greater than 3 (== 4 .. 8) the following 4 bytes are copied 
into the registers.


So everything is correct with the current code.


+   if (cf->can_dlc > 4) {
data = be32_to_cpup((__be32 *)>data[4]);
flexcan_write(data, >tx_mb->data[1]);
}



Regards,
Oliver


Re: [patch net-next 02/10] devlink: Add support for resource abstraction

2017-12-25 Thread Arkadi Sharshevsky


On 12/20/2017 09:43 PM, David Miller wrote:
> From: Jiri Pirko 
> Date: Wed, 20 Dec 2017 12:58:13 +0100
> 
>> From: Arkadi Sharshevsky 
>>
>> Add support for hardware resource abstraction over devlink. Each resource
>> is identified via id, furthermore it contains information regarding its
>> size and its related sub resources. Each resource can also provide its
>> current occupancy.
>>
>> In some cases the sizes of some resources can be changed, yet for those
>> changes to take place a hot driver reload may be needed. The reload
>> capability will be introduced in the next patch.
>>
>> Signed-off-by: Arkadi Sharshevsky 
>> Signed-off-by: Jiri Pirko 
> 
> In what units are these sizes?  If it depends upon the resource, it would
> be great to have a way to introspect the units given a resource.
> 

This is problematic. Currently the units are actually double words
(a single entry is 64 bit) because this resource is actually a memory.
So my first thought was adding a resource_units enum to the UAPI:

enum resource_units {
DEVLINK_RESOURCE_UNITS_WORD,
DEVLINK_RESOURCE_UNITS_DOUBLE_WORD,
DEVLINK_RESOURCE_UNITS_ITEM, /* this is in order to define some
driver specific stuff*/
...
};

But 'item' is too vague because, for example, we will have the
RIF bank as a resource. What unit will it have? rifs? items?

Any inputs on this?


>> +struct devlink_resource_ops *resource_ops;
> 
> Const?
> 
>> +static inline int
>> +devlink_resource_register(struct devlink *devlink,
>> +  const char *resource_name,
>> +  bool top_hierarchy,
>> +  u64 resource_size,
>> +  u64 resource_id,
>> +  u64 parent_resource_id,
>> +  struct devlink_resource_ops *resource_ops)
> 
> Const for resource_ops?
> 
>> +int devlink_resource_register(struct devlink *devlink,
>> +  const char *resource_name,
>> +  bool top_hierarchy,
>> +  u64 resource_size,
>> +  u64 resource_id,
>> +  u64 parent_resource_id,
>> +  struct devlink_resource_ops *resource_ops)
> 
> Likewise.
> 


Re: [patch net-next v4 00/10] net: sched: allow qdiscs to share filter block instances

2017-12-25 Thread Jiri Pirko
Sun, Dec 24, 2017 at 05:25:41PM CET, dsah...@gmail.com wrote:
>On 12/24/17 1:19 AM, Jiri Pirko wrote:
>> Sun, Dec 24, 2017 at 02:54:47AM CET, dsah...@gmail.com wrote:
>>> On 12/23/17 9:54 AM, Jiri Pirko wrote:
 So back to the example. First, we create 2 qdiscs. Both will share
 block number 22. "22" is just an identification. If we don't pass any
 block number, a new one will be generated by kernel:

 $ tc qdisc add dev ens7 ingress block 22
 
 $ tc qdisc add dev ens8 ingress block 22
 

 Now if we list the qdiscs, we will see the block index in the output:

 $ tc qdisc
 qdisc ingress ffff: dev ens7 parent ffff:fff1 block 22
 qdisc ingress ffff: dev ens8 parent ffff:fff1 block 22

 To make it more visual, the situation looks like this:

   ens7 ingress qdisc                 ens8 ingress qdisc
          |                                  |
          |                                  |
          +---------->  block 22  <----------+

 Unlimited number of qdiscs may share the same block.

 Now we can add filter to any of qdiscs sharing the same block:

 $ tc filter add dev ens7 ingress protocol ip pref 25 flower dst_ip 
 192.168.0.0/16 action drop
>>>
>>>
>>> Allowing config of a shared block through any qdisc that references it
>>> is akin to me allowing nexthop objects to be manipulated by any route
>>> that references it -- sure, it could be done but causes a lot surprises
>>> to the user.
>>>
>>> You are adding a new tc object -- a shared block. Why the resistance to
>>> creating a proper API for managing it?
>> 
>> Again, no resistance, I said many times it would be done as a follow-up.
>> But as an api already exists, it has to continue to work. Or do you
>> suggest it should stop working? That, I don't agree with.
>> 
>
>That is exactly what I am saying - principle of least surprise. The new
>object brings its own API and can only be modified using the new API.
>The scheme above can and will surprise users. You are thinking like a tc
>developer, someone intimately familiar with the code, and not like an
>ordinary user of this new feature.

Breaking existing tools is never good. Note that this is not only about the
filter add/del interface but also about dump and notifications. I agree to
extend the api for the "block handle", sure, but the existing api should
continue to work.


Re: [QUESTION] Doubt about NAPI_GRO_CB(skb)->is_atomic in tcpv4 gro process

2017-12-25 Thread Yunsheng Lin
Hi, Alexander

On 2017/12/23 0:32, Alexander Duyck wrote:
> On Fri, Dec 22, 2017 at 12:49 AM, Yunsheng Lin  wrote:
>> Hi, Alexander
>>
>> On 2017/12/22 0:29, Alexander Duyck wrote:
>>> On Thu, Dec 21, 2017 at 1:16 AM, Yunsheng Lin  
>>> wrote:
 Hi, Alexander

 On 2017/12/21 0:24, Alexander Duyck wrote:
> On Wed, Dec 20, 2017 at 1:09 AM, Yunsheng Lin  
> wrote:
>> Hi, all
>> I have some doubt about NAPI_GRO_CB(skb)->is_atomic when
>> analyzing the tcpv4 gro process:
>>
>> Firstly we set NAPI_GRO_CB(skb)->is_atomic to 1 in dev_gro_receive:
>> https://elixir.free-electrons.com/linux/v4.15-rc4/source/net/core/dev.c#L4838
>>
>> And then in inet_gro_receive, we check the NAPI_GRO_CB(skb)->is_atomic
>> before setting NAPI_GRO_CB(skb)->is_atomic according to IP_DF bit in the 
>> ip header:
>> https://elixir.free-electrons.com/linux/v4.15-rc4/source/net/ipv4/af_inet.c#L1319
>>
>> struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff 
>> *skb)
>> {
>> .
>> for (p = *head; p; p = p->next) {
>> 
>>
>> /* If the previous IP ID value was based on an atomic
>>  * datagram we can overwrite the value and ignore it.
>>  */
>> if (NAPI_GRO_CB(skb)->is_atomic)  
>> //we check it here
>> NAPI_GRO_CB(p)->flush_id = flush_id;
>> else
>> NAPI_GRO_CB(p)->flush_id |= flush_id;
>> }
>>
>> NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));  
>> //we set it here
>> NAPI_GRO_CB(skb)->flush |= flush;
>> skb_set_network_header(skb, off);
>> 
>> }
>>
>> My question is whether we should check the NAPI_GRO_CB(skb)->is_atomic 
>> or NAPI_GRO_CB(p)->is_atomic?
>> If we should check NAPI_GRO_CB(skb)->is_atomic, then maybe it is 
>> unnecessary because it is alway true.
>> If we should check NAPI_GRO_CB(p)->is_atomic, maybe there is a bug here.
>>
>> So what is the logic here? I am just start analyzing the gro, maybe I 
>> miss something obvious here.
>
> The logic there is to address the multiple IP header case where there
> are 2 or more IP headers due to things like VXLAN or GRE tunnels. So
> what will happen is that an outer IP header will end up being sent
> with DF not set and will clear the is_atomic value then we want to OR
> in the next header that is applied. It defaults to assignment on
> is_atomic because the first IP header will encounter flush_id with no
> previous configuration occupying it.

 I see your point now.

 But for the same flow of tunnels packet, the outer and inner ip header must
 have the same fixed id or increment id?

 For example, if we have a flow of tunnels packet which has fixed id in 
 outer
 header and increment id in inner header(the inner header does have DF flag 
 set):
>>
>> Sorry, a typo error here. I meant the inner header does *not* have DF flag 
>> set here.
>>

 1. For the first packet, NAPI_GRO_CB(skb)->is_atomic will be set to zero 
 when
 inet_gro_receive is processing the inner ip header.

 2. For the second packet, when inet_gro_receive is processing the outer ip 
 header
 which has a fixed id, NAPI_GRO_CB(p)->is_atomic is zero according to [1], 
 so
 NAPI_GRO_CB(p)->flush_id will be set to 0x, then the second packet 
 will not
 be merged to first packet in tcp_gro_receive.
>>>
>>> I'm not sure how valid your case here is. The is_atomic is only really
>>> meant to apply to the inner-most header.
>>
>> For the new skb, NAPI_GRO_CB(skb)->is_atomic is indeed applied to the
>> inner-most header.
>>
>> What about the NAPI_GRO_CB(p)->is_atomic, p is the same skb flow already
>> merged by gro.
>>
>> Let me try if I understand it correctly:
>> when there is only one skb merged in p, then NAPI_GRO_CB(p)->is_atomic
>> is set according to the first skb' inner-most ip DF flags.
>>
>> When the second skb comes, and inet_gro_receive is processing the
>> outer-most ip, for the below code, NAPI_GRO_CB(p)->is_atomic
>> is for first skb's inner-most ip DF flags, and "iph->frag_off & htons(IP_DF)"
>> is for second skb' outer-most ip DF flags.
>> Why don't we use the first skb's outer-most ip DF flags here? I think it is
>> more logical to use first skb's outer-most ip DF flags here. But the first
>> skb's outer-most ip DF flags is lost when we get here, right?
>>
>> if (!NAPI_GRO_CB(p)->is_atomic ||
>> !(iph->frag_off & htons(IP_DF))) {
>> flush_id ^= 

[patch net-next 0/2] net: sched: Fix RED qdisc offload flag

2017-12-25 Thread Nogah Frankel
Replacing the RED offload flag (TC_RED_OFFLOADED) with the generic one
(TCQ_F_OFFLOADED) deleted some of the logic behind it. This patchset fixes
this problem.

Nogah Frankel (2):
  net_sch: red: Fix the new offload indication
  net: sched: Move offload check till after dump call

 net/sched/sch_api.c |  5 ++---
 net/sched/sch_red.c | 26 ++
 2 files changed, 16 insertions(+), 15 deletions(-)

-- 
2.4.3



[patch net-next 2/2] net: sched: Move offload check till after dump call

2017-12-25 Thread Nogah Frankel
Move the check of the offload state to after the qdisc dump action was
called, so the qdisc could update it if it was changed.

Fixes: 7a4fa29106d9 ("net: sched: Add TCA_HW_OFFLOAD")
Signed-off-by: Nogah Frankel 
Reviewed-by: Yuval Mintz 
---
 net/sched/sch_api.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 3a3a1da..758f132 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -807,11 +807,10 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
tcm->tcm_info = refcount_read(>refcnt);
if (nla_put_string(skb, TCA_KIND, q->ops->id))
goto nla_put_failure;
-   if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
-   goto nla_put_failure;
if (q->ops->dump && q->ops->dump(q, skb) < 0)
goto nla_put_failure;
-
+   if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
+   goto nla_put_failure;
qlen = qdisc_qlen_sum(q);
 
stab = rtnl_dereference(q->stab);
-- 
2.4.3



[patch net-next 1/2] net_sch: red: Fix the new offload indication

2017-12-25 Thread Nogah Frankel
Update the offload flag, TCQ_F_OFFLOADED, in each dump call (and ignore
the offloading function's return value in relation to this flag).
This is done because a qdisc is initialized, and therefore offloaded,
before being grafted. Since the ability of the driver to offload the qdisc
depends on its location, a qdisc can be offloaded and un-offloaded by graft
calls that don't otherwise affect the qdisc itself.

Fixes: 428a68af3a7c ("net: sched: Move to new offload indication in RED"
Signed-off-by: Nogah Frankel 
Reviewed-by: Yuval Mintz 
---
 net/sched/sch_red.c | 26 ++
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index ec0bd36..a392eaa 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -157,7 +157,6 @@ static int red_offload(struct Qdisc *sch, bool enable)
.handle = sch->handle,
.parent = sch->parent,
};
-   int err;
 
if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
return -EOPNOTSUPP;
@@ -172,14 +171,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
opt.command = TC_RED_DESTROY;
}
 
-   err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, );
-
-   if (!err && enable)
-   sch->flags |= TCQ_F_OFFLOADED;
-   else
-   sch->flags &= ~TCQ_F_OFFLOADED;
-
-   return err;
+   return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, );
 }
 
 static void red_destroy(struct Qdisc *sch)
@@ -297,12 +289,22 @@ static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
.stats.qstats = >qstats,
},
};
+   int err;
+
+   sch->flags &= ~TCQ_F_OFFLOADED;
 
-   if (!(sch->flags & TCQ_F_OFFLOADED))
+   if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+   return 0;
+
+   err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+   _stats);
+   if (err == -EOPNOTSUPP)
return 0;
 
-   return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
-_stats);
+   if (!err)
+   sch->flags |= TCQ_F_OFFLOADED;
+
+   return err;
 }
 
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
-- 
2.4.3



[patch iproute2 v3 1/4] lib/libnetlink: Add a function rtnl_talk_msg

2017-12-25 Thread Chris Mi
rtnl_talk can only send a single message to the kernel. Add a new function,
rtnl_talk_msg, that can send multiple messages to the kernel.

Signed-off-by: Chris Mi 
---
 include/libnetlink.h |  3 ++
 lib/libnetlink.c | 92 
 2 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..e95fad75 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..f5f675cf 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,36 +581,21 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct nlmsgerr *err,
strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_check_ack(struct rtnl_handle *rtnl, struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn,
+  unsigned int seq)
 {
int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
+   char *buf;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   struct nlmsghdr *h;
+   struct iovec iov;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
.msg_iov = ,
.msg_iovlen = 1,
};
-   char *buf;
-
-   n->nlmsg_seq = seq = ++rtnl->seq;
-
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
-
-   status = sendmsg(rtnl->fd, , 0);
-   if (status < 0) {
-   perror("Cannot talk to rtnetlink");
-   return -1;
-   }
 
while (1) {
status = rtnl_recvmsg(rtnl->fd, , );
@@ -698,12 +683,77 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
}
 }
 
+static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+{
+   unsigned int seq;
+   int status;
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec iov = {
+   .iov_base = n,
+   .iov_len = n->nlmsg_len
+   };
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = ,
+   .msg_iovlen = 1,
+   };
+
+   n->nlmsg_seq = seq = ++rtnl->seq;
+
+   if (answer == NULL)
+   n->nlmsg_flags |= NLM_F_ACK;
+
+   status = sendmsg(rtnl->fd, , 0);
+   if (status < 0) {
+   perror("Cannot talk to rtnetlink");
+   return -1;
+   }
+
+   return __rtnl_check_ack(rtnl, answer, show_rtnl_err, errfn, seq);
+}
+
+static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+{
+   int iovlen = m->msg_iovlen;
+   unsigned int seq = 0;
+   struct nlmsghdr *n;
+   struct iovec *v;
+   int i, status;
+
+   for (i = 0; i < iovlen; i++) {
+   v = >msg_iov[i];
+   n = v->iov_base;
+   n->nlmsg_seq = seq = ++rtnl->seq;
+   if (answer == NULL)
+   n->nlmsg_flags |= NLM_F_ACK;
+   }
+
+   status = sendmsg(rtnl->fd, m, 0);
+   if (status < 0) {
+   perror("Cannot talk to rtnetlink");
+   return -1;
+   }
+
+   return __rtnl_check_ack(rtnl, answer, show_rtnl_err, errfn, seq);
+}
+
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
 {
return __rtnl_talk(rtnl, n, answer, true, NULL);
 }
 
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+{
+   return __rtnl_talk_msg(rtnl, m, answer, true, NULL);
+}
+
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 struct nlmsghdr **answer,
 nl_ext_ack_fn_t errfn)
-- 
2.14.3



[patch iproute2 v3 3/4] tc: Add -bs option to batch mode

2017-12-25 Thread Chris Mi
Signed-off-by: Chris Mi 
---
 tc/m_action.c  |  91 +--
 tc/tc.c|  47 ++
 tc/tc_common.h |   8 +++-
 tc/tc_filter.c | 121 +
 4 files changed, 204 insertions(+), 63 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..c4c3b862 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "utils.h"
 #include "tc_common.h"
@@ -546,40 +547,88 @@ bad_val:
return ret;
 }
 
+typedef struct {
+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+} tc_action_req;
+
+static tc_action_req *action_reqs;
+static struct iovec msg_iov[MSG_IOV_MAX];
+
+void free_action_reqs(void)
+{
+   free(action_reqs);
+}
+
+static tc_action_req *get_action_req(int batch_size, int index)
+{
+   tc_action_req *req;
+
+   if (action_reqs == NULL) {
+   action_reqs = malloc(batch_size * sizeof (tc_action_req));
+   if (action_reqs == NULL)
+   return NULL;
+   }
+   req = _reqs[index];
+   memset(req, 0, sizeof (*req));
+
+   return req;
+}
+
 static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   int batch_size, int index, bool send)
 {
int argc = *argc_p;
char **argv = *argv_p;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
+   tc_action_req *req;
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec *iov = _iov[index];
+
+   req = get_action_req(batch_size, index);
+   if (req == NULL) {
+   fprintf(stderr, "get_action_req error: not enough buffer\n");
+   return -ENOMEM;
+   }
+
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   struct rtattr *tail = NLMSG_TAIL(>n);
+
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = msg_iov,
+   .msg_iovlen = index + 1,
};
-   struct rtattr *tail = NLMSG_TAIL();
 
argc -= 1;
argv += 1;
-   if (parse_action(, , TCA_ACT_TAB, )) {
+   if (parse_action(, , TCA_ACT_TAB, >n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
+
+   *argc_p = argc;
+   *argv_p = argv;
+
+   iov->iov_base = >n;
+   iov->iov_len = req->n.nlmsg_len;
+
+   if (!send)
+   return 0;
 
-   if (rtnl_talk(, , NULL) < 0) {
+   ret = rtnl_talk_msg(, , NULL);
+   if (ret < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
 
-   *argc_p = argc;
-   *argv_p = argv;
-
return ret;
 }
 
@@ -679,7 +728,7 @@ bad_val:
return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, int batch_size, int index, bool send)
 {
 
int ret = 0;
@@ -689,12 +738,14 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   , );
+   , , batch_size,
+   index, send);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0) {
ret = tc_action_modify(RTM_NEWACTION,
   NLM_F_CREATE | NLM_F_REPLACE,
-  , );
+  , , batch_size,
+  index, send);
} else if (matches(*argv, "delete") == 0) {
argc -= 1;
argv += 1;
diff --git a/tc/tc.c b/tc/tc.c
index ad9f07e9..7ea2fc89 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -189,20 +189,20 @@ static void usage(void)
fprintf(stderr, "Usage: tc [ OPTIONS ] OBJECT { 

[patch iproute2 v3 4/4] man: Add -bs option to tc manpage

2017-12-25 Thread Chris Mi
Signed-off-by: Chris Mi 
---
 man/man8/tc.8 | 5 +
 1 file changed, 5 insertions(+)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index ff071b33..de137e16 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -601,6 +601,11 @@ must exist already.
 read commands from provided file or standard input and invoke them.
 First failure will cause termination of tc.
 
+.TP
+.BR "\-bs", " \-bs size", " \-batchsize", " \-batchsize size"
+How many commands are accumulated before sending to kernel.
+By default, it is 1. It only takes effect in batch mode.
+
 .TP
 .BR "\-force"
 don't terminate tc on errors in batch mode.
-- 
2.14.3



[patch iproute2 v3 2/4] utils: Add a function setcmdlinetotal

2017-12-25 Thread Chris Mi
This function calculates how many commands a batch file has and
stores the result in the global variable cmdlinetotal.

Signed-off-by: Chris Mi 
---
 include/utils.h |  4 
 lib/utils.c | 20 
 2 files changed, 24 insertions(+)

diff --git a/include/utils.h b/include/utils.h
index d3895d56..113a8c31 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -235,6 +235,10 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr *n);
 
 extern int cmdlineno;
 ssize_t getcmdline(char **line, size_t *len, FILE *in);
+
+extern int cmdlinetotal;
+void setcmdlinetotal(const char *name);
+
 int makeargs(char *line, char *argv[], int maxargs);
 int inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6);
 
diff --git a/lib/utils.c b/lib/utils.c
index 7ced8c06..53ca389f 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -1202,6 +1202,26 @@ ssize_t getcmdline(char **linep, size_t *lenp, FILE *in)
return cc;
 }
 
+int cmdlinetotal;
+
+void setcmdlinetotal(const char *name)
+{
+   char *line = NULL;
+   size_t len = 0;
+
+   if (name && strcmp(name, "-") != 0) {
+   if (freopen(name, "r", stdin) == NULL) {
+   fprintf(stderr, "Cannot open file \"%s\" for reading: 
%s\n",
+   name, strerror(errno));
+   return;
+   }
+   }
+
+   cmdlinetotal = 0;
+   while (getcmdline(, , stdin) != -1)
+   cmdlinetotal++;
+}
+
 /* split command line into argument vector */
 int makeargs(char *line, char *argv[], int maxargs)
 {
-- 
2.14.3



[patch iproute2 v3 0/4] tc: Add -bs option to batch mode

2017-12-25 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to the kernel for processing. With this patchset, we can
accumulate several commands before sending them to the kernel. The batch
size is specified using the -bs or -batchsize option.

To accumulate the commands in tc, the client allocates an array of
struct iovec. If the batch size is bigger than 1, the client calls
rtnl_talk_msg only after it has accumulated enough commands, and the
message it sends then includes the whole iov array. The one exception
is when there are no more commands left in the batch file.
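
Roughly, the caller side looks like the sketch below (illustrative only:
rtnl_talk_msg is the helper added in patch 1, while send_batch, BATCH_MAX
and the reqs array are made-up names for this example):

#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/netlink.h>
#include "libnetlink.h"		/* rtnl_talk_msg(), struct rtnl_handle */

#define BATCH_MAX 128		/* illustrative cap on accumulated commands */

static int send_batch(struct rtnl_handle *rth, struct nlmsghdr **reqs, int n)
{
	struct iovec iov[BATCH_MAX];
	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
	struct msghdr msg = {
		.msg_name = &nladdr,
		.msg_namelen = sizeof(nladdr),
		.msg_iov = iov,
		.msg_iovlen = n < BATCH_MAX ? n : BATCH_MAX,
	};
	int i;

	for (i = 0; i < (int)msg.msg_iovlen; i++) {
		iov[i].iov_base = reqs[i];	/* one netlink request per command */
		iov[i].iov_len = reqs[i]->nlmsg_len;
	}

	/* a single sendmsg() carries the whole batch; only the number of
	 * user/kernel round trips changes */
	return rtnl_talk_msg(rth, &msg, NULL);
}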

But please note that the kernel still processes the requests one by one;
processing them in parallel in the kernel is a separate effort. What this
patchset saves is user/kernel mode context switches, so it works on top of
the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real0m15.125s
user0m6.982s
sys 0m8.080s

With this patchset, 'tc -b $file -bs 10' execution time is:

real0m12.772s
user0m5.984s
sys 0m6.723s

The insertion rate is improved by more than 10%.

In this patchset, we still ack for every rule. If we don't ack at all,

'tc -b $file' execution time is:

real0m14.484s
user0m6.919s
sys 0m7.498s

'tc -b $file -bs 10' execution time is:

real0m11.664s
user0m6.017s
sys 0m5.578s

We can see that the performance win comes from sending multiple messages,
not from skipping acks. I think that's because tc doesn't spend much time
processing the ack messages.


v3
==
1. Instead of hacking function rtnl_talk directly, add a new function
   rtnl_talk_msg. 
2. Remove most of the global variables in favor of parameter passing.
3. Divide the previous patch into 4 patches.

Chris Mi (4):
  lib/libnetlink: Add a function rtnl_talk_msg
  utils: Add a function setcmdlinetotal
  tc: Add -bs option to batch mode
  man: Add -bs option to tc manpage

 include/libnetlink.h |   3 ++
 include/utils.h  |   4 ++
 lib/libnetlink.c |  92 ++-
 lib/utils.c  |  20 +
 man/man8/tc.8|   5 +++
 tc/m_action.c|  91 +-
 tc/tc.c  |  47 
 tc/tc_common.h   |   8 +++-
 tc/tc_filter.c   | 121 +--
 9 files changed, 307 insertions(+), 84 deletions(-)

-- 
2.14.3



[patch net] mlxsw: spectrum: Relax sanity checks during enslavement

2017-12-25 Thread Jiri Pirko
From: Ido Schimmel 

Since commit 25cc72a33835 ("mlxsw: spectrum: Forbid linking to devices that
have uppers") the driver forbids enslavement to netdevs that already
have uppers of their own, as this can result in various ordering
problems.

This requirement proved to be too strict for some users who need to be
able to enslave ports to a bridge that already has uppers. In this case,
we can allow the enslavement if the bridge is already known to us, as
any configuration performed on top of the bridge was already reflected
to the device.

Fixes: 25cc72a33835 ("mlxsw: spectrum: Forbid linking to devices that have 
uppers")
Signed-off-by: Ido Schimmel 
Reported-by: Alexander Petrovskiy 
Tested-by: Alexander Petrovskiy 
Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c   | 11 +--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h   |  2 ++
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c |  6 ++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 9bd8d28..c3837ca 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4376,7 +4376,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev,
}
if (!info->linking)
break;
-   if (netdev_has_any_upper_dev(upper_dev)) {
+   if (netdev_has_any_upper_dev(upper_dev) &&
+   (!netif_is_bridge_master(upper_dev) ||
+!mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp,
+ upper_dev))) {
NL_SET_ERR_MSG(extack,
   "spectrum: Enslaving a port to a device 
that already has an upper device is not supported");
return -EINVAL;
@@ -4504,6 +4507,7 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
  u16 vid)
 {
struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+   struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
struct netdev_notifier_changeupper_info *info = ptr;
struct netlink_ext_ack *extack;
struct net_device *upper_dev;
@@ -4520,7 +4524,10 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
}
if (!info->linking)
break;
-   if (netdev_has_any_upper_dev(upper_dev)) {
+   if (netdev_has_any_upper_dev(upper_dev) &&
+   (!netif_is_bridge_master(upper_dev) ||
+!mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp,
+ upper_dev))) {
NL_SET_ERR_MSG(extack, "spectrum: Enslaving a port to a 
device that already has an upper device is not supported");
return -EINVAL;
}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 432ab9b..05ce1be 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -365,6 +365,8 @@ int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
 void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port,
struct net_device *brport_dev,
struct net_device *br_dev);
+bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp,
+const struct net_device *br_dev);
 
 /* spectrum.c */
 int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 7b8548e..593ad31 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -152,6 +152,12 @@ mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge,
return NULL;
 }
 
+bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp,
+const struct net_device *br_dev)
+{
+   return !!mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev);
+}
+
 static struct mlxsw_sp_bridge_device *
 mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge,
  struct net_device *br_dev)
-- 
2.9.5



Re: [patch net] mlxsw: spectrum_router: Fix NULL pointer deref

2017-12-25 Thread Jiri Pirko
Mon, Dec 25, 2017 at 08:57:35AM CET, j...@resnulli.us wrote:
>From: Ido Schimmel 
>
>When we remove the neighbour associated with a nexthop we should always
>refuse to write the nexthop to the adjacency table. Regardless if it is
>already present in the table or not.
>
>Otherwise, we risk dereferencing the NULL pointer that was set instead
>of the neighbour.
>
>Fixes: a7ff87acd995 ("mlxsw: spectrum_router: Implement next-hop routing")
>Signed-off-by: Ido Schimmel 
>Reported-by: Alexander Petrovskiy 
>Signed-off-by: Jiri Pirko 

Dave, could you please queue this up for 4.14.y together
with "mlxsw: spectrum: Relax sanity checks during enslavement".

Thanks!