from:"Josef Bacik"

Re: [PATCH bpf-next v4 3/5] error-injection: Separate error-injection from kprobe

2018-01-11 Thread Josef Bacik

On Thu, Jan 11, 2018 at 09:50:24AM +0900, Masami Hiramatsu wrote:
> Since error-injection framework is not limited to be used
> by kprobes, nor bpf. Other kernel subsystems can use it
> freely for checking safeness of error-injection, e.g.
> livepatch, ftrace etc.
> So this separate error-injection framework from kprobes.
> 
> Some differences has been made:
> 
> - "kprobe" word is removed from any APIs/structures.
> - BPF_ALLOW_ERROR_INJECTION() is renamed to
>   ALLOW_ERROR_INJECTION() since it is not limited for BPF too.
> - CONFIG_FUNCTION_ERROR_INJECTION is the config item of this
>   feature. It is automatically enabled if the arch supports
>   error injection feature for kprobe or ftrace etc.
> 
> Signed-off-by: Masami Hiramatsu <mhira...@kernel.org>

Reviewed-by: Josef Bacik <jba...@fb.com>

Thanks,

Josef

Re: [PATCH bpf-next v3 5/5] error-injection: Support fault injection framework

2018-01-10 Thread Josef Bacik

On Wed, Jan 10, 2018 at 07:19:05PM +0900, Masami Hiramatsu wrote:
> Support in-kernel fault-injection framework via debugfs.
> This allows you to inject a conditional error to specified
> function using debugfs interfaces.
> 
> Here is the result of test script described in
> Documentation/fault-injection/fault-injection.txt
> 
>   ===
>   # ./test_fail_function.sh
>   1+0 records in
>   1+0 records out
>   1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.0227404 s, 46.1 MB/s
>   btrfs-progs v4.4
>   See http://btrfs.wiki.kernel.org for more information.
> 
>   Label:  (null)
>   UUID:   bfa96010-12e9-4360-aed0-42eec7af5798
>   Node size:  16384
>   Sector size:4096
>   Filesystem size:1001.00MiB
>   Block group profiles:
> Data: single8.00MiB
> Metadata: DUP  58.00MiB
> System:   DUP  12.00MiB
>   SSD detected:   no
>   Incompat features:  extref, skinny-metadata
>   Number of devices:  1
>   Devices:
>  IDSIZE  PATH
>   1  1001.00MiB  /dev/loop2
> 
>   mount: mount /dev/loop2 on /opt/tmpmnt failed: Cannot allocate memory
>   SUCCESS!
>   ===
> 
> 
> Signed-off-by: Masami Hiramatsu <mhira...@kernel.org>

Reviewed-by: Josef Bacik <jba...@fb.com>

Thanks,

Josef

Re: [PATCH bpf-next v3 4/5] error-injection: Add injectable error types

2018-01-10 Thread Josef Bacik

telist[];
>  
>  static void __init populate_kernel_ei_list(void)
>  {
> @@ -157,11 +171,26 @@ static void *ei_seq_next(struct seq_file *m, void *v, 
> loff_t *pos)
>   return seq_list_next(v, _injection_list, pos);
>  }
>  
> +static const char *error_type_string(int etype)
> +{
> + switch (etype) {
> + case EI_ETYPE_NULL:
> + return "NULL";
> + case EI_ETYPE_ERRNO:
> + return "ERRNO";
> + case EI_ETYPE_ERRNO_NULL:
> + return "ERRNO_NULL";
> + default:
> + return "(unknown)";
> + }
> +}
> +
>  static int ei_seq_show(struct seq_file *m, void *v)
>  {
>   struct ei_entry *ent = list_entry(v, struct ei_entry, list);
>  
> - seq_printf(m, "%pf\n", (void *)ent->start_addr);
> + seq_printf(m, "%pf\t%s\n", (void *)ent->start_addr,
> +error_type_string(ent->etype));
>   return 0;
>  }

Lol ignore the comment in my previous review about this part.  Also I love this,

Reviewed-by: Josef Bacik <jba...@fb.com>

Thanks,

Josef

Re: [PATCH bpf-next v3 3/5] error-injection: Separate error-injection from kprobe

2018-01-10 Thread Josef Bacik

On Wed, Jan 10, 2018 at 07:18:05PM +0900, Masami Hiramatsu wrote:
> Since error-injection framework is not limited to be used
> by kprobes, nor bpf. Other kernel subsystems can use it
> freely for checking safeness of error-injection, e.g.
> livepatch, ftrace etc.
> So this separate error-injection framework from kprobes.
> 
> Some differences has been made:
> 
> - "kprobe" word is removed from any APIs/structures.
> - BPF_ALLOW_ERROR_INJECTION() is renamed to
>   ALLOW_ERROR_INJECTION() since it is not limited for BPF too.
> - CONFIG_FUNCTION_ERROR_INJECTION is the config item of this
>   feature. It is automatically enabled if the arch supports
>   error injection feature for kprobe or ftrace etc.
> 
> Signed-off-by: Masami Hiramatsu 
> ---
>   Changes in v3:
>- Fix a build error for asmlinkage on i386 by including compiler.h
>- Fix "CONFIG_FUNCTION_ERROR_INJECT" typo.
>- Separate CONFIG_MODULES dependent code
>- Add CONFIG_KPROBES dependency for arch_deref_entry_point()
>- Call error-injection init function in late_initcall stage.
>- Fix read-side mutex lock
>- Some cosmetic cleanups
> ---
>  arch/Kconfig   |2 
>  arch/x86/Kconfig   |2 
>  arch/x86/include/asm/error-injection.h |   13 ++
>  arch/x86/kernel/kprobes/core.c |   14 --
>  arch/x86/lib/Makefile  |1 
>  arch/x86/lib/error-inject.c|   19 +++
>  fs/btrfs/disk-io.c |2 
>  fs/btrfs/free-space-cache.c|2 
>  include/asm-generic/error-injection.h  |   20 +++
>  include/asm-generic/vmlinux.lds.h  |   14 +-
>  include/linux/bpf.h|   12 --
>  include/linux/error-injection.h|   21 +++
>  include/linux/kprobes.h|1 
>  include/linux/module.h |6 -
>  kernel/kprobes.c   |  163 
>  kernel/module.c|8 +
>  kernel/trace/Kconfig   |2 
>  kernel/trace/bpf_trace.c   |2 
>  kernel/trace/trace_kprobe.c|3 
>  lib/Kconfig.debug  |4 +
>  lib/Makefile   |1 
>  lib/error-inject.c |  213 
> 
>  22 files changed, 315 insertions(+), 210 deletions(-)
>  create mode 100644 arch/x86/include/asm/error-injection.h
>  create mode 100644 arch/x86/lib/error-inject.c
>  create mode 100644 include/asm-generic/error-injection.h
>  create mode 100644 include/linux/error-injection.h
>  create mode 100644 lib/error-inject.c
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index d3f4aaf9cb7a..97376accfb14 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -196,7 +196,7 @@ config HAVE_OPTPROBES
>  config HAVE_KPROBES_ON_FTRACE
>   bool
>  
> -config HAVE_KPROBE_OVERRIDE
> +config HAVE_FUNCTION_ERROR_INJECTION
>   bool
>  
>  config HAVE_NMI
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 45dc6233f2b9..366b19cb79b7 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -154,7 +154,7 @@ config X86
>   select HAVE_KERNEL_XZ
>   select HAVE_KPROBES
>   select HAVE_KPROBES_ON_FTRACE
> - select HAVE_KPROBE_OVERRIDE
> + select HAVE_FUNCTION_ERROR_INJECTION
>   select HAVE_KRETPROBES
>   select HAVE_KVM
>   select HAVE_LIVEPATCH   if X86_64
> diff --git a/arch/x86/include/asm/error-injection.h 
> b/arch/x86/include/asm/error-injection.h
> new file mode 100644
> index ..47b7a1296245
> --- /dev/null
> +++ b/arch/x86/include/asm/error-injection.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_ERROR_INJECTION_H
> +#define _ASM_ERROR_INJECTION_H
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +asmlinkage void just_return_func(void);
> +void override_function_with_return(struct pt_regs *regs);
> +
> +#endif /* _ASM_ERROR_INJECTION_H */
> diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
> index b02a377d5905..bd36f3c33cd0 100644
> --- a/arch/x86/kernel/kprobes/core.c
> +++ b/arch/x86/kernel/kprobes/core.c
> @@ -1183,17 +1183,3 @@ int arch_trampoline_kprobe(struct kprobe *p)
>  {
>   return 0;
>  }
> -
> -asmlinkage void override_func(void);
> -asm(
> - ".type override_func, @function\n"
> - "override_func:\n"
> - "   ret\n"
> - ".size override_func, .-override_func\n"
> -);
> -
> -void arch_kprobe_override_function(struct pt_regs *regs)
> -{
> - regs->ip = (unsigned long)_func;
> -}
> -NOKPROBE_SYMBOL(arch_kprobe_override_function);
> diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
> index 7b181b61170e..171377b83be1 100644
> --- a/arch/x86/lib/Makefile
> +++ b/arch/x86/lib/Makefile
> @@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o
>  lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
>  lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o

Re: [PATCH bpf-next v3 2/5] tracing/kprobe: bpf: Compare instruction pointer with original one

2018-01-10 Thread Josef Bacik

On Wed, Jan 10, 2018 at 07:17:35PM +0900, Masami Hiramatsu wrote:
> Compare instruction pointer with original one on the
> stack instead using per-cpu bpf_kprobe_override flag.
> 
> This patch also consolidates reset_current_kprobe() and
> preempt_enable_no_resched() blocks. Those can be done
> in one place.
> 
> Signed-off-by: Masami Hiramatsu <mhira...@kernel.org>

Reviewed-by: Josef Bacik <jba...@fb.com>

Thanks,

Josef

Re: [PATCH bpf-next v3 1/5] tracing/kprobe: bpf: Check error injectable event is on function entry

2018-01-10 Thread Josef Bacik

On Wed, Jan 10, 2018 at 07:17:06PM +0900, Masami Hiramatsu wrote:
> Check whether error injectable event is on function entry or not.
> Currently it checks the event is ftrace-based kprobes or not,
> but that is wrong. It should check if the event is on the entry
> of target function. Since error injection will override a function
> to just return with modified return value, that operation must
> be done before the target function starts making stackframe.
> 
> As a side effect, bpf error injection is no need to depend on
> function-tracer. It can work with sw-breakpoint based kprobe
> events too.
> 
> Signed-off-by: Masami Hiramatsu <mhira...@kernel.org>

Reviewed-by: Josef Bacik <jba...@fb.com>

Thanks,

Josef

Re: [RFC PATCH bpf-next v2 0/4] Separate error injection table from kprobes

2018-01-04 Thread Josef Bacik

On Tue, Dec 26, 2017 at 04:46:28PM +0900, Masami Hiramatsu wrote:
> Hi Josef and Alexei,
> 
> Here are the 2nd version of patches to moving error injection
> table from kprobes. In this series I did a small fixes and
> add function-based fault injection.
> 
> Here is the previous version:
> 
> https://lkml.org/lkml/2017/12/22/554
> 
> There are 2 main reasons why I separate it from kprobes.
> 
>  - kprobes users can modify execution path not only at 
>error-injection whitelist functions but also other
>functions. I don't like to suggest user that such
>limitation is from kprobes itself.
> 
>  - This error injection information is also useful for
>ftrace (function-hook) and livepatch. It should not
>be limited by CONFIG_KPROBES.
> 
> So I introduced CONFIG_FUNCTION_ERROR_INJECTION for this feature.
> Also CONFIG_FAIL_FUNCTION is added, which provides function-based
> error injection interface via debugfs following fault-injection
> framework. See [4/4].
> 
> Any thoughts?

Sorry Masami, I've been on vacation for the last two weeks.  This approach is
fine by me, if we want to allow other mechanisms other than bpf to use this
functionality then hooray.  I'll do a proper review when you post v3, just
wanted to let you know I wasn't ignoring you.  Thanks,

Josef

[PATCH] trace: reenable preemption if we modify the ip

2017-12-15 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Things got moved around between the original bpf_override_return patches
and the final version, and now the ftrace kprobe dispatcher assumes if
you modified the ip that you also enabled preemption.  Make a comment of
this and enable preemption, this fixes the lockdep splat that happened
when using this feature.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 kernel/trace/trace_kprobe.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5db849809a56..91f4b57dab82 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1322,8 +1322,15 @@ static int kprobe_dispatcher(struct kprobe *kp, struct 
pt_regs *regs)
if (tk->tp.flags & TP_FLAG_TRACE)
kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
-   if (tk->tp.flags & TP_FLAG_PROFILE)
+   if (tk->tp.flags & TP_FLAG_PROFILE) {
ret = kprobe_perf_func(tk, regs);
+   /*
+* The ftrace kprobe handler leaves it up to us to re-enable
+* preemption here before returning if we've modified the ip.
+*/
+   if (ret)
+   preempt_enable_no_resched();
+   }
 #endif
return ret;
 }
-- 
2.7.5

[PATCH v10 1/5] add infrastructure for tagging functions as error injectable

2017-12-15 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Using BPF we can override kprob'ed functions and return arbitrary
values.  Obviously this can be a bit unsafe, so make this feature opt-in
for functions.  Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in
order to give BPF access to that function for error injection purposes.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 include/asm-generic/vmlinux.lds.h |  10 +++
 include/linux/bpf.h   |  11 +++
 include/linux/kprobes.h   |   1 +
 include/linux/module.h|   5 ++
 kernel/kprobes.c  | 163 ++
 kernel/module.c   |   6 +-
 6 files changed, 195 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/vmlinux.lds.h 
b/include/asm-generic/vmlinux.lds.h
index ee8b707d9fa9..a2e8582d094a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -136,6 +136,15 @@
 #define KPROBE_BLACKLIST()
 #endif
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define ERROR_INJECT_LIST(). = ALIGN(8);   
\
+   
VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .;   \
+   KEEP(*(_kprobe_error_inject_list))  
\
+   VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) 
= .;
+#else
+#define ERROR_INJECT_LIST()
+#endif
+
 #ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS(). = ALIGN(8);   
\
VMLINUX_SYMBOL(__start_ftrace_events) = .;  \
@@ -564,6 +573,7 @@
FTRACE_EVENTS() \
TRACE_SYSCALLS()\
KPROBE_BLACKLIST()  \
+   ERROR_INJECT_LIST() \
MEM_DISCARD(init.rodata)\
CLK_OF_TABLES() \
RESERVEDMEM_OF_TABLES() \
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e55e4255a210..7f4d2a953173 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -576,4 +576,15 @@ extern const struct bpf_func_proto 
bpf_sock_map_update_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define BPF_ALLOW_ERROR_INJECTION(fname)   \
+static unsigned long __used\
+   __attribute__((__section__("_kprobe_error_inject_list")))   \
+   _eil_addr_##fname = (unsigned long)fname;
+#else
+#define BPF_ALLOW_ERROR_INJECTION(fname)
+#endif
+#endif
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 9440a2fc8893..963fd364f3d6 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -271,6 +271,7 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset);
 extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, 
unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
+extern bool within_kprobe_error_injection_list(unsigned long addr);
 
 struct kprobe_insn_cache {
struct mutex mutex;
diff --git a/include/linux/module.h b/include/linux/module.h
index c69b49abe877..548fa09fa806 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -475,6 +475,11 @@ struct module {
ctor_fn_t *ctors;
unsigned int num_ctors;
 #endif
+
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+   unsigned int num_kprobe_ei_funcs;
+   unsigned long *kprobe_ei_funcs;
+#endif
 } cacheline_aligned __randomize_layout;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index da2ccf142358..b4aab48ad258 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -83,6 +83,16 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned 
long hash)
return &(kretprobe_table_locks[hash].lock);
 }
 
+/* List of symbols that can be overriden for error injection. */
+static LIST_HEAD(kprobe_error_injection_list);
+static DEFINE_MUTEX(kprobe_ei_mutex);
+struct kprobe_ei_entry {
+   struct list_head list;
+   unsigned long start_addr;
+   unsigned long end_addr;
+   void *priv;
+};
+
 /* Blacklist -- list of struct kprobe_blacklist_entry */
 static LIST_HEAD(kprobe_blacklist);
 
@@ -1394,6 +1404,17 @@ bool within_kprobe_blacklist(unsigned long addr)
return false;
 }
 
+bool within_kprobe_error_injection_list(unsigned long addr)
+{
+   struct kprobe_ei_entry *ent;
+
+   list_for_each_entry(ent, _error_injection_list, list) {
+   i

[PATCH v10 5/5] btrfs: allow us to inject errors at io_ctl_init

2017-12-15 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This was instrumental in reproducing a space cache bug.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 fs/btrfs/free-space-cache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4426d1c73e50..fb1382893bfc 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
@@ -332,6 +333,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct 
inode *inode,
 
return 0;
 }
+BPF_ALLOW_ERROR_INJECTION(io_ctl_init);
 
 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 {
-- 
2.7.5

[PATCH v10 4/5] samples/bpf: add a test for bpf_override_return

2017-12-15 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Acked-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 samples/bpf/Makefile  |  4 
 samples/bpf/test_override_return.sh   | 15 +++
 samples/bpf/tracex7_kern.c| 16 
 samples/bpf/tracex7_user.c| 28 
 tools/include/uapi/linux/bpf.h|  7 ++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index adeaa1302f34..4fb944a7ecf8 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -101,6 +103,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -155,6 +158,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh 
b/samples/bpf/test_override_return.sh
new file mode 100755
index ..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+   echo "SUCCESS!"
+else
+   echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index ..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   unsigned long rc = -12;
+
+   bpf_override_return(ctx, rc);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index ..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+   FILE *f;
+   char filename[256];
+   char command[256];
+   int ret;
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+   f = popen(command, "r");
+   ret = pclose(f);
+
+   return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4c223ab30293..cf446c25c0ec 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -677,6 +677,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -736,7 +740,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_val

[PATCH v10 2/5] btrfs: make open_ctree error injectable

2017-12-15 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This allows us to do error injection with BPF for open_ctree.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 fs/btrfs/disk-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 10a2a579cc7f..02b5f5667754 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "ctree.h"
 #include "disk-io.h"
@@ -3123,6 +3124,7 @@ int open_ctree(struct super_block *sb,
goto fail_block_groups;
goto retry_root_backup;
 }
+BPF_ALLOW_ERROR_INJECTION(open_ctree);
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-- 
2.7.5

[PATCH v10 3/5] bpf: add a bpf_override_function helper

2017-12-15 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Acked-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/Kconfig |  3 ++
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/kprobes.h   |  4 +++
 arch/x86/include/asm/ptrace.h|  5 
 arch/x86/kernel/kprobes/ftrace.c | 14 +
 include/linux/filter.h   |  3 +-
 include/linux/trace_events.h |  1 +
 include/uapi/linux/bpf.h |  7 -
 kernel/bpf/core.c|  3 ++
 kernel/bpf/verifier.c|  2 ++
 kernel/events/core.c |  7 +
 kernel/trace/Kconfig | 11 +++
 kernel/trace/bpf_trace.c | 38 
 kernel/trace/trace_kprobe.c  | 64 +++-
 kernel/trace/trace_probe.h   | 12 
 15 files changed, 165 insertions(+), 10 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 400b9e1b2f27..d3f4aaf9cb7a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,6 +196,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
+config HAVE_KPROBE_OVERRIDE
+   bool
+
 config HAVE_NMI
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8eed3f94bfc7..04d66e6fa447 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -154,6 +154,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+   select HAVE_KPROBE_OVERRIDE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 9f2e3102e0bb..36abb23a7a35 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 14131dd06b29..6de1fd3d0097 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct 
pt_regs *regs)
return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long 
rc)
+{
+   regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 8dc0161cec8f..1ea748d682fd 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+   ".type override_func, @function\n"
+   "override_func:\n"
+   "   ret\n"
+   ".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+   regs->ip = (unsigned long)_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0062302e1285..5feb441d3dd9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,8 @@ struct bpf_prog {
locked:1,   /* Program image locked? */
gpl_compatible:1, /* Is filter GPL compatible? 
*/
cb_access:1,/* Is control block accessed? */
-   dst_needed:1;   /* Do we need dst entry? */
+   dst_needed:1,   /* Do we need dst entry? */
+   kprobe_override:1; /* Do we override a kprobe? 
*/
enum bpf_prog_type  type;   /* Type of BPF program */
u32 len;/* Number of filter blocks */
u32 jited_len;  /*

[PATCH v10 0/5] Add the ability to do BPF directed error injection

2017-12-15 Thread Josef Bacik

Just one last go around I hope, fixed the preemption thing that Darrick
reported.

v9->v10:
- the kprobe dispather now requires us to re-enable preemption if we change the
  ip ourselves, so do that.

v8->v9:
- rebased onto the bpf tree.

v7->v8:
- removed the _ASM_KPROBE_ERROR_INJECT since it was not needed.

v6->v7:
- moved the opt-in macro to bpf.h out of kprobes.h.

v5->v6:
- add BPF_ALLOW_ERROR_INJECTION() tagging for functions that will support this
  feature.  This way only functions that opt-in will be allowed to be
  overridden.
- added a btrfs patch to allow error injection for open_ctree() so that the bpf
  sample actually works.

v4->v5:
- disallow kprobe_override programs from being put in the prog map array so we
  don't tail call into something we didn't check.  This allows us to make the
  normal path still fast without a bunch of percpu operations.

v3->v4:
- fix a build error found by kbuild test bot (I didn't wait long enough
  apparently.)
- Added a warning message as per Daniels suggestion.

v2->v3:
- added a ->kprobe_override flag to bpf_prog.
- added some sanity checks to disallow attaching bpf progs that have
  ->kprobe_override set that aren't for ftrace kprobes.
- added the trace_kprobe_ftrace helper to check if the trace_event_call is a
  ftrace kprobe.
- renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read this
  value in the kprobe path, and thus only write to it if we're overriding or
  clearing the override.

v1->v2:
- moved things around to make sure that bpf_override_return could really only be
  used for an ftrace kprobe.
- killed the special return values from trace_call_bpf.
- renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
  it was being called from an ftrace kprobe context.
- reworked the logic in kprobe_perf_func to take advantage of bpf_kprobe_state.
- updated the test as per Alexei's review.

- Original message -

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproduceable results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actual do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

Re: [PATCH v9 0/5] Add the ability to do BPF directed error injection

2017-12-13 Thread Josef Bacik

On Wed, Dec 13, 2017 at 10:07:32AM -0800, Darrick J. Wong wrote:
> On Wed, Dec 13, 2017 at 01:03:57PM -0500, Josef Bacik wrote:
> > On Tue, Dec 12, 2017 at 03:11:50PM -0800, Darrick J. Wong wrote:
> > > On Mon, Dec 11, 2017 at 11:36:45AM -0500, Josef Bacik wrote:
> > > > This is the same as v8, just rebased onto the bpf tree.
> > > > 
> > > > v8->v9:
> > > > - rebased onto the bpf tree.
> > > > 
> > > > v7->v8:
> > > > - removed the _ASM_KPROBE_ERROR_INJECT since it was not needed.
> > > > 
> > > > v6->v7:
> > > > - moved the opt-in macro to bpf.h out of kprobes.h.
> > > > 
> > > > v5->v6:
> > > > - add BPF_ALLOW_ERROR_INJECTION() tagging for functions that will 
> > > > support this
> > > >   feature.  This way only functions that opt-in will be allowed to be
> > > >   overridden.
> > > > - added a btrfs patch to allow error injection for open_ctree() so that 
> > > > the bpf
> > > >   sample actually works.
> > > > 
> > > > v4->v5:
> > > > - disallow kprobe_override programs from being put in the prog map 
> > > > array so we
> > > >   don't tail call into something we didn't check.  This allows us to 
> > > > make the
> > > >   normal path still fast without a bunch of percpu operations.
> > > > 
> > > > v3->v4:
> > > > - fix a build error found by kbuild test bot (I didn't wait long enough
> > > >   apparently.)
> > > > - Added a warning message as per Daniels suggestion.
> > > > 
> > > > v2->v3:
> > > > - added a ->kprobe_override flag to bpf_prog.
> > > > - added some sanity checks to disallow attaching bpf progs that have
> > > >   ->kprobe_override set that aren't for ftrace kprobes.
> > > > - added the trace_kprobe_ftrace helper to check if the trace_event_call 
> > > > is a
> > > >   ftrace kprobe.
> > > > - renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only 
> > > > read this
> > > >   value in the kprobe path, and thus only write to it if we're 
> > > > overriding or
> > > >   clearing the override.
> > > > 
> > > > v1->v2:
> > > > - moved things around to make sure that bpf_override_return could 
> > > > really only be
> > > >   used for an ftrace kprobe.
> > > > - killed the special return values from trace_call_bpf.
> > > > - renamed pc_modified to bpf_kprobe_state so bpf_override_return could 
> > > > tell if
> > > >   it was being called from an ftrace kprobe context.
> > > > - reworked the logic in kprobe_perf_func to take advantage of 
> > > > bpf_kprobe_state.
> > > > - updated the test as per Alexei's review.
> > > > 
> > > > - Original message -
> > > > 
> > > > A lot of our error paths are not well tested because we have no good 
> > > > way of
> > > > injecting errors generically.  Some subystems (block, memory) have ways 
> > > > to
> > > > inject errors, but they are random so it's hard to get reproduceable 
> > > > results.
> > > > 
> > > > With BPF we can add determinism to our error injection.  We can use 
> > > > kprobes and
> > > > other things to verify we are injecting errors at the exact case we are 
> > > > trying
> > > > to test.  This patch gives us the tool to actual do the error injection 
> > > > part.
> > > > It is very simple, we just set the return value of the pt_regs we're 
> > > > given to
> > > > whatever we provide, and then override the PC with a dummy function 
> > > > that simply
> > > > returns.
> > > 
> > > Heh, this looks cool.  I decided to try it to see what happens, and saw
> > > a bunch of dmesg pasted in below.  Is that supposed to happen?  Or am I
> > > the only fs developer still running with lockdep enabled? :)
> > > 
> > > It looks like bpf_override_return has some sort of side effect such that
> > > we get the splat, since commenting it out makes the symptom go away.
> > > 
> > > 
> > > 
> > > --D
> > > 
> > > [ 1847.769183] BTRFS error (device (null)): open_ctree failed
> > > [ 1847.770130] BUG: sleeping function called from inva

Re: [PATCH v9 0/5] Add the ability to do BPF directed error injection

2017-12-13 Thread Josef Bacik

On Tue, Dec 12, 2017 at 03:11:50PM -0800, Darrick J. Wong wrote:
> On Mon, Dec 11, 2017 at 11:36:45AM -0500, Josef Bacik wrote:
> > This is the same as v8, just rebased onto the bpf tree.
> > 
> > v8->v9:
> > - rebased onto the bpf tree.
> > 
> > v7->v8:
> > - removed the _ASM_KPROBE_ERROR_INJECT since it was not needed.
> > 
> > v6->v7:
> > - moved the opt-in macro to bpf.h out of kprobes.h.
> > 
> > v5->v6:
> > - add BPF_ALLOW_ERROR_INJECTION() tagging for functions that will support 
> > this
> >   feature.  This way only functions that opt-in will be allowed to be
> >   overridden.
> > - added a btrfs patch to allow error injection for open_ctree() so that the 
> > bpf
> >   sample actually works.
> > 
> > v4->v5:
> > - disallow kprobe_override programs from being put in the prog map array so 
> > we
> >   don't tail call into something we didn't check.  This allows us to make 
> > the
> >   normal path still fast without a bunch of percpu operations.
> > 
> > v3->v4:
> > - fix a build error found by kbuild test bot (I didn't wait long enough
> >   apparently.)
> > - Added a warning message as per Daniels suggestion.
> > 
> > v2->v3:
> > - added a ->kprobe_override flag to bpf_prog.
> > - added some sanity checks to disallow attaching bpf progs that have
> >   ->kprobe_override set that aren't for ftrace kprobes.
> > - added the trace_kprobe_ftrace helper to check if the trace_event_call is a
> >   ftrace kprobe.
> > - renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read 
> > this
> >   value in the kprobe path, and thus only write to it if we're overriding or
> >   clearing the override.
> > 
> > v1->v2:
> > - moved things around to make sure that bpf_override_return could really 
> > only be
> >   used for an ftrace kprobe.
> > - killed the special return values from trace_call_bpf.
> > - renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell 
> > if
> >   it was being called from an ftrace kprobe context.
> > - reworked the logic in kprobe_perf_func to take advantage of 
> > bpf_kprobe_state.
> > - updated the test as per Alexei's review.
> > 
> > - Original message -
> > 
> > A lot of our error paths are not well tested because we have no good way of
> > injecting errors generically.  Some subystems (block, memory) have ways to
> > inject errors, but they are random so it's hard to get reproduceable 
> > results.
> > 
> > With BPF we can add determinism to our error injection.  We can use kprobes 
> > and
> > other things to verify we are injecting errors at the exact case we are 
> > trying
> > to test.  This patch gives us the tool to actual do the error injection 
> > part.
> > It is very simple, we just set the return value of the pt_regs we're given 
> > to
> > whatever we provide, and then override the PC with a dummy function that 
> > simply
> > returns.
> 
> Heh, this looks cool.  I decided to try it to see what happens, and saw
> a bunch of dmesg pasted in below.  Is that supposed to happen?  Or am I
> the only fs developer still running with lockdep enabled? :)
> 
> It looks like bpf_override_return has some sort of side effect such that
> we get the splat, since commenting it out makes the symptom go away.
> 
> 
> 
> --D
> 
> [ 1847.769183] BTRFS error (device (null)): open_ctree failed
> [ 1847.770130] BUG: sleeping function called from invalid context at 
> /storage/home/djwong/cdev/work/linux-xfs/kernel/locking/rwsem.c:69
> [ 1847.771976] in_atomic(): 1, irqs_disabled(): 0, pid: 1524, name: mount
> [ 1847.773016] 1 lock held by mount/1524:
> [ 1847.773530]  #0:  (>s_umount_key#34/1){+.+.}, at: 
> [<653a9bb4>] sget_userns+0x302/0x4f0
> [ 1847.774731] Preemption disabled at:
> [ 1847.774735] [<  (null)>]   (null)
> [ 1847.777009] CPU: 2 PID: 1524 Comm: mount Tainted: GW
> 4.15.0-rc3-xfsx #3
> [ 1847.778800] Call Trace:
> [ 1847.779047]  dump_stack+0x7c/0xbe
> [ 1847.779361]  ___might_sleep+0x1f7/0x260
> [ 1847.779720]  down_write+0x29/0xb0
> [ 1847.780046]  unregister_shrinker+0x15/0x70
> [ 1847.780427]  deactivate_locked_super+0x2e/0x60
> [ 1847.780935]  btrfs_mount+0xbb6/0x1000 [btrfs]
> [ 1847.781353]  ? __lockdep_init_map+0x5c/0x1d0
> [ 1847.781750]  ? mount_fs+0xf/0x80
> [ 1847.782065]  ? alloc_vfsmnt+0x1a1/0x230
> [ 1847.782429]  mount_fs+0xf/0x80
> [ 1847.782733]  vfs_kern_mount+0x62/0x160
> [ 1847.783128]  btrfs_mo

[PATCH v9 1/5] add infrastructure for tagging functions as error injectable

2017-12-11 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Using BPF we can override kprob'ed functions and return arbitrary
values.  Obviously this can be a bit unsafe, so make this feature opt-in
for functions.  Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in
order to give BPF access to that function for error injection purposes.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 include/asm-generic/vmlinux.lds.h |  10 +++
 include/linux/bpf.h   |  11 +++
 include/linux/kprobes.h   |   1 +
 include/linux/module.h|   5 ++
 kernel/kprobes.c  | 163 ++
 kernel/module.c   |   6 +-
 6 files changed, 195 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/vmlinux.lds.h 
b/include/asm-generic/vmlinux.lds.h
index ee8b707d9fa9..a2e8582d094a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -136,6 +136,15 @@
 #define KPROBE_BLACKLIST()
 #endif
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define ERROR_INJECT_LIST(). = ALIGN(8);   
\
+   
VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .;   \
+   KEEP(*(_kprobe_error_inject_list))  
\
+   VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) 
= .;
+#else
+#define ERROR_INJECT_LIST()
+#endif
+
 #ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS(). = ALIGN(8);   
\
VMLINUX_SYMBOL(__start_ftrace_events) = .;  \
@@ -564,6 +573,7 @@
FTRACE_EVENTS() \
TRACE_SYSCALLS()\
KPROBE_BLACKLIST()  \
+   ERROR_INJECT_LIST() \
MEM_DISCARD(init.rodata)\
CLK_OF_TABLES() \
RESERVEDMEM_OF_TABLES() \
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e55e4255a210..7f4d2a953173 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -576,4 +576,15 @@ extern const struct bpf_func_proto 
bpf_sock_map_update_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define BPF_ALLOW_ERROR_INJECTION(fname)   \
+static unsigned long __used\
+   __attribute__((__section__("_kprobe_error_inject_list")))   \
+   _eil_addr_##fname = (unsigned long)fname;
+#else
+#define BPF_ALLOW_ERROR_INJECTION(fname)
+#endif
+#endif
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 9440a2fc8893..963fd364f3d6 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -271,6 +271,7 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset);
 extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, 
unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
+extern bool within_kprobe_error_injection_list(unsigned long addr);
 
 struct kprobe_insn_cache {
struct mutex mutex;
diff --git a/include/linux/module.h b/include/linux/module.h
index c69b49abe877..548fa09fa806 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -475,6 +475,11 @@ struct module {
ctor_fn_t *ctors;
unsigned int num_ctors;
 #endif
+
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+   unsigned int num_kprobe_ei_funcs;
+   unsigned long *kprobe_ei_funcs;
+#endif
 } cacheline_aligned __randomize_layout;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index da2ccf142358..b4aab48ad258 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -83,6 +83,16 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned 
long hash)
return &(kretprobe_table_locks[hash].lock);
 }
 
+/* List of symbols that can be overriden for error injection. */
+static LIST_HEAD(kprobe_error_injection_list);
+static DEFINE_MUTEX(kprobe_ei_mutex);
+struct kprobe_ei_entry {
+   struct list_head list;
+   unsigned long start_addr;
+   unsigned long end_addr;
+   void *priv;
+};
+
 /* Blacklist -- list of struct kprobe_blacklist_entry */
 static LIST_HEAD(kprobe_blacklist);
 
@@ -1394,6 +1404,17 @@ bool within_kprobe_blacklist(unsigned long addr)
return false;
 }
 
+bool within_kprobe_error_injection_list(unsigned long addr)
+{
+   struct kprobe_ei_entry *ent;
+
+   list_for_each_entry(ent, _error_injection_list, list) {
+   i

[PATCH v9 4/5] samples/bpf: add a test for bpf_override_return

2017-12-11 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Acked-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 samples/bpf/Makefile  |  4 
 samples/bpf/test_override_return.sh   | 15 +++
 samples/bpf/tracex7_kern.c| 16 
 samples/bpf/tracex7_user.c| 28 
 tools/include/uapi/linux/bpf.h|  7 ++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index adeaa1302f34..4fb944a7ecf8 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -101,6 +103,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -155,6 +158,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh 
b/samples/bpf/test_override_return.sh
new file mode 100755
index ..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+   echo "SUCCESS!"
+else
+   echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index ..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   unsigned long rc = -12;
+
+   bpf_override_return(ctx, rc);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index ..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+   FILE *f;
+   char filename[256];
+   char command[256];
+   int ret;
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+   f = popen(command, "r");
+   ret = pclose(f);
+
+   return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4c223ab30293..cf446c25c0ec 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -677,6 +677,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -736,7 +740,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_val

[PATCH v9 2/5] btrfs: make open_ctree error injectable

2017-12-11 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This allows us to do error injection with BPF for open_ctree.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 fs/btrfs/disk-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 10a2a579cc7f..02b5f5667754 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "ctree.h"
 #include "disk-io.h"
@@ -3123,6 +3124,7 @@ int open_ctree(struct super_block *sb,
goto fail_block_groups;
goto retry_root_backup;
 }
+BPF_ALLOW_ERROR_INJECTION(open_ctree);
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-- 
2.7.5

[PATCH v9 5/5] btrfs: allow us to inject errors at io_ctl_init

2017-12-11 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This was instrumental in reproducing a space cache bug.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 fs/btrfs/free-space-cache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4426d1c73e50..fb1382893bfc 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
@@ -332,6 +333,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct 
inode *inode,
 
return 0;
 }
+BPF_ALLOW_ERROR_INJECTION(io_ctl_init);
 
 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 {
-- 
2.7.5

[PATCH v9 0/5] Add the ability to do BPF directed error injection

2017-12-11 Thread Josef Bacik

This is the same as v8, just rebased onto the bpf tree.

v8->v9:
- rebased onto the bpf tree.

v7->v8:
- removed the _ASM_KPROBE_ERROR_INJECT since it was not needed.

v6->v7:
- moved the opt-in macro to bpf.h out of kprobes.h.

v5->v6:
- add BPF_ALLOW_ERROR_INJECTION() tagging for functions that will support this
  feature.  This way only functions that opt-in will be allowed to be
  overridden.
- added a btrfs patch to allow error injection for open_ctree() so that the bpf
  sample actually works.

v4->v5:
- disallow kprobe_override programs from being put in the prog map array so we
  don't tail call into something we didn't check.  This allows us to make the
  normal path still fast without a bunch of percpu operations.

v3->v4:
- fix a build error found by kbuild test bot (I didn't wait long enough
  apparently.)
- Added a warning message as per Daniels suggestion.

v2->v3:
- added a ->kprobe_override flag to bpf_prog.
- added some sanity checks to disallow attaching bpf progs that have
  ->kprobe_override set that aren't for ftrace kprobes.
- added the trace_kprobe_ftrace helper to check if the trace_event_call is a
  ftrace kprobe.
- renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read this
  value in the kprobe path, and thus only write to it if we're overriding or
  clearing the override.

v1->v2:
- moved things around to make sure that bpf_override_return could really only be
  used for an ftrace kprobe.
- killed the special return values from trace_call_bpf.
- renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
  it was being called from an ftrace kprobe context.
- reworked the logic in kprobe_perf_func to take advantage of bpf_kprobe_state.
- updated the test as per Alexei's review.

- Original message -

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproduceable results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actual do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

[PATCH v9 3/5] bpf: add a bpf_override_function helper

2017-12-11 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Acked-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/Kconfig |  3 +++
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/kprobes.h   |  4 +++
 arch/x86/include/asm/ptrace.h|  5 
 arch/x86/kernel/kprobes/ftrace.c | 14 ++
 include/linux/filter.h   |  3 ++-
 include/linux/trace_events.h |  1 +
 include/uapi/linux/bpf.h |  7 -
 kernel/bpf/core.c|  3 +++
 kernel/bpf/verifier.c|  2 ++
 kernel/events/core.c |  7 +
 kernel/trace/Kconfig | 11 
 kernel/trace/bpf_trace.c | 38 +++
 kernel/trace/trace_kprobe.c  | 55 +++-
 kernel/trace/trace_probe.h   | 12 +
 15 files changed, 157 insertions(+), 9 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 400b9e1b2f27..d3f4aaf9cb7a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,6 +196,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
+config HAVE_KPROBE_OVERRIDE
+   bool
+
 config HAVE_NMI
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8eed3f94bfc7..04d66e6fa447 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -154,6 +154,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+   select HAVE_KPROBE_OVERRIDE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 9f2e3102e0bb..36abb23a7a35 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 14131dd06b29..6de1fd3d0097 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct 
pt_regs *regs)
return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long 
rc)
+{
+   regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 8dc0161cec8f..1ea748d682fd 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+   ".type override_func, @function\n"
+   "override_func:\n"
+   "   ret\n"
+   ".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+   regs->ip = (unsigned long)_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0062302e1285..5feb441d3dd9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,8 @@ struct bpf_prog {
locked:1,   /* Program image locked? */
gpl_compatible:1, /* Is filter GPL compatible? 
*/
cb_access:1,/* Is control block accessed? */
-   dst_needed:1;   /* Do we need dst entry? */
+   dst_needed:1,   /* Do we need dst entry? */
+   kprobe_override:1; /* Do we override a kprobe? 
*/
enum bpf_prog_type  type;   /* Type of BPF program */
u32 len;/* Number of filter blocks */
u32 ji

[PATCH v8 3/5] bpf: add a bpf_override_function helper

2017-12-06 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Acked-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/Kconfig |  3 +++
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/kprobes.h   |  4 +++
 arch/x86/include/asm/ptrace.h|  5 
 arch/x86/kernel/kprobes/ftrace.c | 14 ++
 include/linux/filter.h   |  3 ++-
 include/linux/trace_events.h |  1 +
 include/uapi/linux/bpf.h |  7 -
 kernel/bpf/core.c|  3 +++
 kernel/bpf/verifier.c|  2 ++
 kernel/events/core.c |  7 +
 kernel/trace/Kconfig | 11 
 kernel/trace/bpf_trace.c | 38 +++
 kernel/trace/trace_kprobe.c  | 55 +++-
 kernel/trace/trace_probe.h   | 12 +
 15 files changed, 157 insertions(+), 9 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index d789a89cb32c..4fb618082259 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -195,6 +195,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
+config HAVE_KPROBE_OVERRIDE
+   bool
+
 config HAVE_NMI
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac13506..5126d2750dd0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+   select HAVE_KPROBE_OVERRIDE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 91c04c8e67fa..f04e71800c2f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -108,6 +108,11 @@ static inline unsigned long regs_return_value(struct 
pt_regs *regs)
return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long 
rc)
+{
+   regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+   ".type override_func, @function\n"
+   "override_func:\n"
+   "   ret\n"
+   ".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+   regs->ip = (unsigned long)_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index cdd78a7beaae..dfa44fd74bae 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,8 @@ struct bpf_prog {
locked:1,   /* Program image locked? */
gpl_compatible:1, /* Is filter GPL compatible? 
*/
cb_access:1,/* Is control block accessed? */
-   dst_needed:1;   /* Do we need dst entry? */
+   dst_needed:1,   /* Do we need dst entry? */
+   kprobe_override:1; /* Do we override a kprobe? 
*/
kmemcheck_bitfield_end(meta);
enum bpf_prog_type  type;   /* Type of BPF program */
u32 len;/* Number of filter blocks */
di

[PATCH v8 4/5] samples/bpf: add a test for bpf_override_return

2017-12-06 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Acked-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 samples/bpf/Makefile  |  4 
 samples/bpf/test_override_return.sh   | 15 +++
 samples/bpf/tracex7_kern.c| 16 
 samples/bpf/tracex7_user.c| 28 
 tools/include/uapi/linux/bpf.h|  7 ++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ea2b9e6135f3..83d06bc1f710 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -100,6 +102,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -153,6 +156,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh 
b/samples/bpf/test_override_return.sh
new file mode 100755
index ..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+   echo "SUCCESS!"
+else
+   echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index ..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   unsigned long rc = -12;
+
+   bpf_override_return(ctx, rc);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index ..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+   FILE *f;
+   char filename[256];
+   char command[256];
+   int ret;
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+   f = popen(command, "r");
+   ret = pclose(f);
+
+   return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4a4b6e78c977..3756dde69834 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -673,6 +673,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -732,7 +736,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_val

[PATCH v8 1/5] add infrastructure for tagging functions as error injectable

2017-12-06 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Using BPF we can override kprob'ed functions and return arbitrary
values.  Obviously this can be a bit unsafe, so make this feature opt-in
for functions.  Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in
order to give BPF access to that function for error injection purposes.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 include/asm-generic/vmlinux.lds.h |  10 +++
 include/linux/bpf.h   |  11 +++
 include/linux/kprobes.h   |   1 +
 include/linux/module.h|   5 ++
 kernel/kprobes.c  | 163 ++
 kernel/module.c   |   6 +-
 6 files changed, 195 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/vmlinux.lds.h 
b/include/asm-generic/vmlinux.lds.h
index 8acfc1e099e1..85822804861e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -136,6 +136,15 @@
 #define KPROBE_BLACKLIST()
 #endif
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define ERROR_INJECT_LIST(). = ALIGN(8);   
\
+   
VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .;   \
+   KEEP(*(_kprobe_error_inject_list))  
\
+   VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) 
= .;
+#else
+#define ERROR_INJECT_LIST()
+#endif
+
 #ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS(). = ALIGN(8);   
\
VMLINUX_SYMBOL(__start_ftrace_events) = .;  \
@@ -560,6 +569,7 @@
FTRACE_EVENTS() \
TRACE_SYSCALLS()\
KPROBE_BLACKLIST()  \
+   ERROR_INJECT_LIST() \
MEM_DISCARD(init.rodata)\
CLK_OF_TABLES() \
RESERVEDMEM_OF_TABLES() \
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 520aeebe0d93..552a666a338b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -530,4 +530,15 @@ extern const struct bpf_func_proto 
bpf_sock_map_update_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define BPF_ALLOW_ERROR_INJECTION(fname)   \
+static unsigned long __used\
+   __attribute__((__section__("_kprobe_error_inject_list")))   \
+   _eil_addr_##fname = (unsigned long)fname;
+#else
+#define BPF_ALLOW_ERROR_INJECTION(fname)
+#endif
+#endif
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index bd2684700b74..4f501cb73aec 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -271,6 +271,7 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset);
 extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, 
unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
+extern bool within_kprobe_error_injection_list(unsigned long addr);
 
 struct kprobe_insn_cache {
struct mutex mutex;
diff --git a/include/linux/module.h b/include/linux/module.h
index fe5aa3736707..7bb1a9b9a322 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -475,6 +475,11 @@ struct module {
ctor_fn_t *ctors;
unsigned int num_ctors;
 #endif
+
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+   unsigned int num_kprobe_ei_funcs;
+   unsigned long *kprobe_ei_funcs;
+#endif
 } cacheline_aligned __randomize_layout;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a1606a4224e1..bdd7dd724f6f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -83,6 +83,16 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned 
long hash)
return &(kretprobe_table_locks[hash].lock);
 }
 
+/* List of symbols that can be overriden for error injection. */
+static LIST_HEAD(kprobe_error_injection_list);
+static DEFINE_MUTEX(kprobe_ei_mutex);
+struct kprobe_ei_entry {
+   struct list_head list;
+   unsigned long start_addr;
+   unsigned long end_addr;
+   void *priv;
+};
+
 /* Blacklist -- list of struct kprobe_blacklist_entry */
 static LIST_HEAD(kprobe_blacklist);
 
@@ -1392,6 +1402,17 @@ bool within_kprobe_blacklist(unsigned long addr)
return false;
 }
 
+bool within_kprobe_error_injection_list(unsigned long addr)
+{
+   struct kprobe_ei_entry *ent;
+
+   list_for_each_entry(ent, _error_injection_list, list) {
+   i

[PATCH v8 5/5] btrfs: allow us to inject errors at io_ctl_init

2017-12-06 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This was instrumental in reproducing a space cache bug.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 fs/btrfs/free-space-cache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cdc9f4015ec3..daa98dc1f844 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
@@ -332,6 +333,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct 
inode *inode,
 
return 0;
 }
+BPF_ALLOW_ERROR_INJECTION(io_ctl_init);
 
 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 {
-- 
2.7.5

[PATCH v8 2/5] btrfs: make open_ctree error injectable

2017-12-06 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This allows us to do error injection with BPF for open_ctree.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 fs/btrfs/disk-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dfdab849037b..69d17a640b94 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "ctree.h"
 #include "disk-io.h"
@@ -3283,6 +3284,7 @@ int open_ctree(struct super_block *sb,
goto fail_block_groups;
goto retry_root_backup;
 }
+BPF_ALLOW_ERROR_INJECTION(open_ctree);
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-- 
2.7.5

[PATCH v8 0/5] Add the ability to do BPF directed error injection

2017-12-06 Thread Josef Bacik

Jon noticed that I had a typo in my _ASM_KPROBE_ERROR_INJECT macro.  I went to
figure out why the compiler didn't catch it and it's because it was not used
anywhere.  I had copied it from the trace blacklist code without understanding
where it was used as cscope didn't find the original macro I was looking for, so
I assumed it was some voodoo and left it in place.  Turns out cscope failed me
and I didn't need the macro at all, the trace blacklist thing I was looking at
was for marking assembly functions as blacklisted and I have no intention of
marking assembly functions as error injectable at the moment.

v7->v8:
- removed the _ASM_KPROBE_ERROR_INJECT since it was not needed.

v6->v7:
- moved the opt-in macro to bpf.h out of kprobes.h.

v5->v6:
- add BPF_ALLOW_ERROR_INJECTION() tagging for functions that will support this
  feature.  This way only functions that opt-in will be allowed to be
  overridden.
- added a btrfs patch to allow error injection for open_ctree() so that the bpf
  sample actually works.

v4->v5:
- disallow kprobe_override programs from being put in the prog map array so we
  don't tail call into something we didn't check.  This allows us to make the
  normal path still fast without a bunch of percpu operations.

v3->v4:
- fix a build error found by kbuild test bot (I didn't wait long enough
  apparently.)
- Added a warning message as per Daniels suggestion.

v2->v3:
- added a ->kprobe_override flag to bpf_prog.
- added some sanity checks to disallow attaching bpf progs that have
  ->kprobe_override set that aren't for ftrace kprobes.
- added the trace_kprobe_ftrace helper to check if the trace_event_call is a
  ftrace kprobe.
- renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read this
  value in the kprobe path, and thus only write to it if we're overriding or
  clearing the override.

v1->v2:
- moved things around to make sure that bpf_override_return could really only be
  used for an ftrace kprobe.
- killed the special return values from trace_call_bpf.
- renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
  it was being called from an ftrace kprobe context.
- reworked the logic in kprobe_perf_func to take advantage of bpf_kprobe_state.
- updated the test as per Alexei's review.

- Original message -

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproduceable results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actual do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

Re: [PATCH v7 1/5] add infrastructure for tagging functions as error injectable

2017-11-30 Thread Josef Bacik

On Wed, Nov 29, 2017 at 05:59:39PM +0100, Daniel Borkmann wrote:
> On 11/28/2017 09:02 PM, Josef Bacik wrote:
> > On Tue, Nov 28, 2017 at 11:58:41AM -0700, Jonathan Corbet wrote:
> >> On Wed, 22 Nov 2017 16:23:30 -0500
> >> Josef Bacik <jo...@toxicpanda.com> wrote:
> >>> From: Josef Bacik <jba...@fb.com>
> >>>
> >>> Using BPF we can override kprob'ed functions and return arbitrary
> >>> values.  Obviously this can be a bit unsafe, so make this feature opt-in
> >>> for functions.  Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in
> >>> order to give BPF access to that function for error injection purposes.
> >>>
> >>> Signed-off-by: Josef Bacik <jba...@fb.com>
> >>> Acked-by: Ingo Molnar <mi...@kernel.org>
> >>> ---
> >>>  arch/x86/include/asm/asm.h|   6 ++
> >>>  include/asm-generic/vmlinux.lds.h |  10 +++
> >>>  include/linux/bpf.h   |  11 +++
> >>>  include/linux/kprobes.h   |   1 +
> >>>  include/linux/module.h|   5 ++
> >>>  kernel/kprobes.c  | 163 
> >>> ++
> >>>  kernel/module.c   |   6 +-
> >>>  7 files changed, 201 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
> >>> index b0dc91f4bedc..340f4cc43255 100644
> >>> --- a/arch/x86/include/asm/asm.h
> >>> +++ b/arch/x86/include/asm/asm.h
> >>> @@ -85,6 +85,12 @@
> >>>   _ASM_PTR (entry);   \
> >>>   .popsection
> >>>  
> >>> +# define _ASM_KPROBE_ERROR_INJECT(entry) \
> >>> + .pushsection "_kprobe_error_inject_list","aw" ; \
> >>> + _ASM_ALIGN ;\
> >>> + _ASM_PTR (entry);   \
> >>> + .popseciton
> >>
> >> So this stuff is not my area of greatest expertise, but I do have to wonder
> >> how ".popseciton" can work ... ?
> > 
> > Well fuck, do you want me to send a increment Daniel/Alexei or resend this 
> > patch
> > fixed?  Thanks,
> 
> Sorry for late reply, please rebase + respin the whole series with
> this fixed. There were also few typos in the cover letter / commit
> messages that would be good to get fixed along the way.
> 
> Also, could you debug why this wasn't caught at compile/runtime during
> testing?
> 

Sat down to figure out what was wrong here, and realized I'm just an idiot.  I
was copying the no kprobe stuff, and my grepping did not uncover what
_ASM_NOKPROBE() was used for, so I assumed it was some auto generated magic and
just copied what it did to cover my bases.  Sat down to figure it out and it is
actually called in some assembly files (which is why cscope didn't find it).  So
we don't need _ASM_KPROBE_ERROR_INJECT at all.  I'll drop it and respin and send
it along.  Thanks,

Josef

Re: [PATCH v7 1/5] add infrastructure for tagging functions as error injectable

2017-11-28 Thread Josef Bacik

On Tue, Nov 28, 2017 at 11:58:41AM -0700, Jonathan Corbet wrote:
> On Wed, 22 Nov 2017 16:23:30 -0500
> Josef Bacik <jo...@toxicpanda.com> wrote:
> 
> > From: Josef Bacik <jba...@fb.com>
> > 
> > Using BPF we can override kprob'ed functions and return arbitrary
> > values.  Obviously this can be a bit unsafe, so make this feature opt-in
> > for functions.  Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in
> > order to give BPF access to that function for error injection purposes.
> > 
> > Signed-off-by: Josef Bacik <jba...@fb.com>
> > Acked-by: Ingo Molnar <mi...@kernel.org>
> > ---
> >  arch/x86/include/asm/asm.h|   6 ++
> >  include/asm-generic/vmlinux.lds.h |  10 +++
> >  include/linux/bpf.h   |  11 +++
> >  include/linux/kprobes.h   |   1 +
> >  include/linux/module.h|   5 ++
> >  kernel/kprobes.c  | 163 
> > ++
> >  kernel/module.c   |   6 +-
> >  7 files changed, 201 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
> > index b0dc91f4bedc..340f4cc43255 100644
> > --- a/arch/x86/include/asm/asm.h
> > +++ b/arch/x86/include/asm/asm.h
> > @@ -85,6 +85,12 @@
> > _ASM_PTR (entry);   \
> > .popsection
> >  
> > +# define _ASM_KPROBE_ERROR_INJECT(entry)   \
> > +   .pushsection "_kprobe_error_inject_list","aw" ; \
> > +   _ASM_ALIGN ;\
> > +   _ASM_PTR (entry);   \
> > +   .popseciton
> 
> So this stuff is not my area of greatest expertise, but I do have to wonder
> how ".popseciton" can work ... ?
> 

Well fuck, do you want me to send a increment Daniel/Alexei or resend this patch
fixed?  Thanks,

Josef

[PATCH v7 5/5] btrfs: allow us to inject errors at io_ctl_init

2017-11-22 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This was instrumental in reproducing a space cache bug.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 fs/btrfs/free-space-cache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cdc9f4015ec3..daa98dc1f844 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
@@ -332,6 +333,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct 
inode *inode,
 
return 0;
 }
+BPF_ALLOW_ERROR_INJECTION(io_ctl_init);
 
 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 {
-- 
2.7.5

[PATCH v7 4/5] samples/bpf: add a test for bpf_override_return

2017-11-22 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Acked-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 samples/bpf/Makefile  |  4 
 samples/bpf/test_override_return.sh   | 15 +++
 samples/bpf/tracex7_kern.c| 16 
 samples/bpf/tracex7_user.c| 28 
 tools/include/uapi/linux/bpf.h|  7 ++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ea2b9e6135f3..83d06bc1f710 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -100,6 +102,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -153,6 +156,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh 
b/samples/bpf/test_override_return.sh
new file mode 100755
index ..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+   echo "SUCCESS!"
+else
+   echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index ..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   unsigned long rc = -12;
+
+   bpf_override_return(ctx, rc);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index ..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+   FILE *f;
+   char filename[256];
+   char command[256];
+   int ret;
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+   f = popen(command, "r");
+   ret = pclose(f);
+
+   return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4a4b6e78c977..3756dde69834 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -673,6 +673,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -732,7 +736,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_val

[PATCH v7 3/5] bpf: add a bpf_override_function helper

2017-11-22 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Acked-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/Kconfig |  3 +++
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/kprobes.h   |  4 +++
 arch/x86/include/asm/ptrace.h|  5 
 arch/x86/kernel/kprobes/ftrace.c | 14 ++
 include/linux/filter.h   |  3 ++-
 include/linux/trace_events.h |  1 +
 include/uapi/linux/bpf.h |  7 -
 kernel/bpf/core.c|  3 +++
 kernel/bpf/verifier.c|  2 ++
 kernel/events/core.c |  7 +
 kernel/trace/Kconfig | 11 
 kernel/trace/bpf_trace.c | 38 +++
 kernel/trace/trace_kprobe.c  | 55 +++-
 kernel/trace/trace_probe.h   | 12 +
 15 files changed, 157 insertions(+), 9 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index d789a89cb32c..4fb618082259 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -195,6 +195,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
+config HAVE_KPROBE_OVERRIDE
+   bool
+
 config HAVE_NMI
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac13506..5126d2750dd0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+   select HAVE_KPROBE_OVERRIDE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 91c04c8e67fa..f04e71800c2f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -108,6 +108,11 @@ static inline unsigned long regs_return_value(struct 
pt_regs *regs)
return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long 
rc)
+{
+   regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+   ".type override_func, @function\n"
+   "override_func:\n"
+   "   ret\n"
+   ".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+   regs->ip = (unsigned long)_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index cdd78a7beaae..dfa44fd74bae 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,8 @@ struct bpf_prog {
locked:1,   /* Program image locked? */
gpl_compatible:1, /* Is filter GPL compatible? 
*/
cb_access:1,/* Is control block accessed? */
-   dst_needed:1;   /* Do we need dst entry? */
+   dst_needed:1,   /* Do we need dst entry? */
+   kprobe_override:1; /* Do we override a kprobe? 
*/
kmemcheck_bitfield_end(meta);
enum bpf_prog_type  type;   /* Type of BPF program */
u32 len;/* Number of filter blocks */
di

[PATCH v7 2/5] btrfs: make open_ctree error injectable

2017-11-22 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This allows us to do error injection with BPF for open_ctree.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 fs/btrfs/disk-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dfdab849037b..69d17a640b94 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "ctree.h"
 #include "disk-io.h"
@@ -3283,6 +3284,7 @@ int open_ctree(struct super_block *sb,
goto fail_block_groups;
goto retry_root_backup;
 }
+BPF_ALLOW_ERROR_INJECTION(open_ctree);
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-- 
2.7.5

[PATCH v7 1/5] add infrastructure for tagging functions as error injectable

2017-11-22 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Using BPF we can override kprob'ed functions and return arbitrary
values.  Obviously this can be a bit unsafe, so make this feature opt-in
for functions.  Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in
order to give BPF access to that function for error injection purposes.

Signed-off-by: Josef Bacik <jba...@fb.com>
Acked-by: Ingo Molnar <mi...@kernel.org>
---
 arch/x86/include/asm/asm.h|   6 ++
 include/asm-generic/vmlinux.lds.h |  10 +++
 include/linux/bpf.h   |  11 +++
 include/linux/kprobes.h   |   1 +
 include/linux/module.h|   5 ++
 kernel/kprobes.c  | 163 ++
 kernel/module.c   |   6 +-
 7 files changed, 201 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index b0dc91f4bedc..340f4cc43255 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -85,6 +85,12 @@
_ASM_PTR (entry);   \
.popsection
 
+# define _ASM_KPROBE_ERROR_INJECT(entry)   \
+   .pushsection "_kprobe_error_inject_list","aw" ; \
+   _ASM_ALIGN ;\
+   _ASM_PTR (entry);   \
+   .popseciton
+
 .macro ALIGN_DESTINATION
/* check for bad alignment of destination */
movl %edi,%ecx
diff --git a/include/asm-generic/vmlinux.lds.h 
b/include/asm-generic/vmlinux.lds.h
index 8acfc1e099e1..85822804861e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -136,6 +136,15 @@
 #define KPROBE_BLACKLIST()
 #endif
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define ERROR_INJECT_LIST(). = ALIGN(8);   
\
+   
VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .;   \
+   KEEP(*(_kprobe_error_inject_list))  
\
+   VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) 
= .;
+#else
+#define ERROR_INJECT_LIST()
+#endif
+
 #ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS(). = ALIGN(8);   
\
VMLINUX_SYMBOL(__start_ftrace_events) = .;  \
@@ -560,6 +569,7 @@
FTRACE_EVENTS() \
TRACE_SYSCALLS()\
KPROBE_BLACKLIST()  \
+   ERROR_INJECT_LIST() \
MEM_DISCARD(init.rodata)\
CLK_OF_TABLES() \
RESERVEDMEM_OF_TABLES() \
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 520aeebe0d93..552a666a338b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -530,4 +530,15 @@ extern const struct bpf_func_proto 
bpf_sock_map_update_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define BPF_ALLOW_ERROR_INJECTION(fname)   \
+static unsigned long __used\
+   __attribute__((__section__("_kprobe_error_inject_list")))   \
+   _eil_addr_##fname = (unsigned long)fname;
+#else
+#define BPF_ALLOW_ERROR_INJECTION(fname)
+#endif
+#endif
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index bd2684700b74..4f501cb73aec 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -271,6 +271,7 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset);
 extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, 
unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
+extern bool within_kprobe_error_injection_list(unsigned long addr);
 
 struct kprobe_insn_cache {
struct mutex mutex;
diff --git a/include/linux/module.h b/include/linux/module.h
index fe5aa3736707..7bb1a9b9a322 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -475,6 +475,11 @@ struct module {
ctor_fn_t *ctors;
unsigned int num_ctors;
 #endif
+
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+   unsigned int num_kprobe_ei_funcs;
+   unsigned long *kprobe_ei_funcs;
+#endif
 } cacheline_aligned __randomize_layout;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a1606a4224e1..bdd7dd724f6f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -83,6 +83,16 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned 
long hash)
r

[PATCH v7 0/4] Add the ability to do BPF directed error injection

2017-11-22 Thread Josef Bacik

This is hopefully the final version, I've addressed the comment by Igno and
added his Acks.

v6->v7:
- moved the opt-in macro to bpf.h out of kprobes.h.

v5->v6:
- add BPF_ALLOW_ERROR_INJECTION() tagging for functions that will support this
  feature.  This way only functions that opt-in will be allowed to be
  overridden.
- added a btrfs patch to allow error injection for open_ctree() so that the bpf
  sample actually works.

v4->v5:
- disallow kprobe_override programs from being put in the prog map array so we
  don't tail call into something we didn't check.  This allows us to make the
  normal path still fast without a bunch of percpu operations.

v3->v4:
- fix a build error found by kbuild test bot (I didn't wait long enough
  apparently.)
- Added a warning message as per Daniels suggestion.

v2->v3:
- added a ->kprobe_override flag to bpf_prog.
- added some sanity checks to disallow attaching bpf progs that have
  ->kprobe_override set that aren't for ftrace kprobes.
- added the trace_kprobe_ftrace helper to check if the trace_event_call is a
  ftrace kprobe.
- renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read this
  value in the kprobe path, and thus only write to it if we're overriding or
  clearing the override.

v1->v2:
- moved things around to make sure that bpf_override_return could really only be
  used for an ftrace kprobe.
- killed the special return values from trace_call_bpf.
- renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
  it was being called from an ftrace kprobe context.
- reworked the logic in kprobe_perf_func to take advantage of bpf_kprobe_state.
- updated the test as per Alexei's review.

- Original message -

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproduceable results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actual do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

[PATCH 3/4] bpf: add a bpf_override_function helper

2017-11-17 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/Kconfig |  3 +++
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/kprobes.h   |  4 +++
 arch/x86/include/asm/ptrace.h|  5 
 arch/x86/kernel/kprobes/ftrace.c | 14 ++
 include/linux/filter.h   |  3 ++-
 include/linux/trace_events.h |  1 +
 include/uapi/linux/bpf.h |  7 -
 kernel/bpf/core.c|  3 +++
 kernel/bpf/verifier.c|  2 ++
 kernel/events/core.c |  7 +
 kernel/trace/Kconfig | 11 
 kernel/trace/bpf_trace.c | 38 +++
 kernel/trace/trace_kprobe.c  | 55 +++-
 kernel/trace/trace_probe.h   | 12 +
 15 files changed, 157 insertions(+), 9 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index d789a89cb32c..4fb618082259 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -195,6 +195,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
+config HAVE_KPROBE_OVERRIDE
+   bool
+
 config HAVE_NMI
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac13506..5126d2750dd0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+   select HAVE_KPROBE_OVERRIDE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 91c04c8e67fa..f04e71800c2f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -108,6 +108,11 @@ static inline unsigned long regs_return_value(struct 
pt_regs *regs)
return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long 
rc)
+{
+   regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+   ".type override_func, @function\n"
+   "override_func:\n"
+   "   ret\n"
+   ".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+   regs->ip = (unsigned long)_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index cdd78a7beaae..dfa44fd74bae 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,8 @@ struct bpf_prog {
locked:1,   /* Program image locked? */
gpl_compatible:1, /* Is filter GPL compatible? 
*/
cb_access:1,/* Is control block accessed? */
-   dst_needed:1;   /* Do we need dst entry? */
+   dst_needed:1,   /* Do we need dst entry? */
+   kprobe_override:1; /* Do we override a kprobe? 
*/
kmemcheck_bitfield_end(meta);
enum bpf_prog_type  type;   /* Type of BPF program */
u32 len;/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linu

[PATCH 4/4] samples/bpf: add a test for bpf_override_return

2017-11-17 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 samples/bpf/Makefile  |  4 
 samples/bpf/test_override_return.sh   | 15 +++
 samples/bpf/tracex7_kern.c| 16 
 samples/bpf/tracex7_user.c| 28 
 tools/include/uapi/linux/bpf.h|  7 ++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ea2b9e6135f3..83d06bc1f710 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -100,6 +102,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -153,6 +156,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh 
b/samples/bpf/test_override_return.sh
new file mode 100755
index ..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+   echo "SUCCESS!"
+else
+   echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index ..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   unsigned long rc = -12;
+
+   bpf_override_return(ctx, rc);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index ..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+   FILE *f;
+   char filename[256];
+   char command[256];
+   int ret;
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+   f = popen(command, "r");
+   ret = pclose(f);
+
+   return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4a4b6e78c977..3756dde69834 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -673,6 +673,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -732,7 +736,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_value),   \
-   FN(getsockopt),
+

[PATCH 1/4] add infrastructure for tagging functions as error injectable

2017-11-17 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Using BPF we can override kprob'ed functions and return arbitrary
values.  Obviously this can be a bit unsafe, so make this feature opt-in
for functions.  Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in
order to give BPF access to that function for error injection purposes.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/x86/include/asm/asm.h|   6 ++
 include/asm-generic/kprobes.h |   9 +++
 include/asm-generic/vmlinux.lds.h |  10 +++
 include/linux/kprobes.h   |   1 +
 include/linux/module.h|   5 ++
 kernel/kprobes.c  | 163 ++
 kernel/module.c   |   6 +-
 7 files changed, 199 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index b0dc91f4bedc..340f4cc43255 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -85,6 +85,12 @@
_ASM_PTR (entry);   \
.popsection
 
+# define _ASM_KPROBE_ERROR_INJECT(entry)   \
+   .pushsection "_kprobe_error_inject_list","aw" ; \
+   _ASM_ALIGN ;\
+   _ASM_PTR (entry);   \
+   .popseciton
+
 .macro ALIGN_DESTINATION
/* check for bad alignment of destination */
movl %edi,%ecx
diff --git a/include/asm-generic/kprobes.h b/include/asm-generic/kprobes.h
index 57af9f21d148..f96c4de5d7b0 100644
--- a/include/asm-generic/kprobes.h
+++ b/include/asm-generic/kprobes.h
@@ -22,4 +22,13 @@ static unsigned long __used  
\
 #endif
 #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define BPF_ALLOW_ERROR_INJECTION(fname)   \
+static unsigned long __used\
+   __attribute__((__section__("_kprobe_error_inject_list")))   \
+   _eil_addr_##fname = (unsigned long)fname;
+#else
+#define BPF_ALLOW_ERROR_INJECTION(fname)
+#endif
+
 #endif /* _ASM_GENERIC_KPROBES_H */
diff --git a/include/asm-generic/vmlinux.lds.h 
b/include/asm-generic/vmlinux.lds.h
index 8acfc1e099e1..85822804861e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -136,6 +136,15 @@
 #define KPROBE_BLACKLIST()
 #endif
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define ERROR_INJECT_LIST(). = ALIGN(8);   
\
+   
VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .;   \
+   KEEP(*(_kprobe_error_inject_list))  
\
+   VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) 
= .;
+#else
+#define ERROR_INJECT_LIST()
+#endif
+
 #ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS(). = ALIGN(8);   
\
VMLINUX_SYMBOL(__start_ftrace_events) = .;  \
@@ -560,6 +569,7 @@
FTRACE_EVENTS() \
TRACE_SYSCALLS()\
KPROBE_BLACKLIST()  \
+   ERROR_INJECT_LIST() \
MEM_DISCARD(init.rodata)\
CLK_OF_TABLES() \
RESERVEDMEM_OF_TABLES() \
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index bd2684700b74..4f501cb73aec 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -271,6 +271,7 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset);
 extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, 
unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
+extern bool within_kprobe_error_injection_list(unsigned long addr);
 
 struct kprobe_insn_cache {
struct mutex mutex;
diff --git a/include/linux/module.h b/include/linux/module.h
index fe5aa3736707..7bb1a9b9a322 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -475,6 +475,11 @@ struct module {
ctor_fn_t *ctors;
unsigned int num_ctors;
 #endif
+
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+   unsigned int num_kprobe_ei_funcs;
+   unsigned long *kprobe_ei_funcs;
+#endif
 } cacheline_aligned __randomize_layout;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a1606a4224e1..7afadf07b34e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -83,6 +83,16 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned 
long hash)
return &(kretprobe_table_locks[hash].lock);
 }
 
+/* List of symbols that can

[PATCH 2/4] btrfs: make open_ctree error injectable

2017-11-17 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This allows us to do error injection with BPF for open_ctree.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 fs/btrfs/disk-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dfdab849037b..c6b4e1f07072 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "ctree.h"
 #include "disk-io.h"
 #include "hash.h"
@@ -3283,6 +3284,7 @@ int open_ctree(struct super_block *sb,
goto fail_block_groups;
goto retry_root_backup;
 }
+BPF_ALLOW_ERROR_INJECTION(open_ctree);
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-- 
2.7.5

[PATCH 0/4][v6] Add the ability to do BPF directed error injection

2017-11-17 Thread Josef Bacik

I've reworked this to be opt-in only as per Igno and Alexei.  Still needs to go
through Dave because of the bpf bits, but I need tracing guys to weigh in and
sign off on my approach please.

v5->v6:
- add BPF_ALLOW_ERROR_INJECTION() tagging for functions that will support this
  feature.  This way only functions that opt-in will be allowed to be
  overridden.
- added a btrfs patch to allow error injection for open_ctree() so that the bpf
  sample actually works.

v4->v5:
- disallow kprobe_override programs from being put in the prog map array so we
  don't tail call into something we didn't check.  This allows us to make the
  normal path still fast without a bunch of percpu operations.

v3->v4:
- fix a build error found by kbuild test bot (I didn't wait long enough
  apparently.)
- Added a warning message as per Daniels suggestion.

v2->v3:
- added a ->kprobe_override flag to bpf_prog.
- added some sanity checks to disallow attaching bpf progs that have
  ->kprobe_override set that aren't for ftrace kprobes.
- added the trace_kprobe_ftrace helper to check if the trace_event_call is a
  ftrace kprobe.
- renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read this
  value in the kprobe path, and thus only write to it if we're overriding or
  clearing the override.

v1->v2:
- moved things around to make sure that bpf_override_return could really only be
  used for an ftrace kprobe.
- killed the special return values from trace_call_bpf.
- renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
  it was being called from an ftrace kprobe context.
- reworked the logic in kprobe_perf_func to take advantage of bpf_kprobe_state.
- updated the test as per Alexei's review.

- Original message -

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproduceable results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actual do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

Re: [PATCH 1/2] bpf: add a bpf_override_function helper

2017-11-13 Thread Josef Bacik

On Sun, Nov 12, 2017 at 11:38:24AM +0100, Ingo Molnar wrote:
> 
> * Alexei Starovoitov  wrote:
> 
> > > One of the major advantages of having an in-kernel BPF sandbox is to 
> > > never 
> > > crash the kernel - and allowing BPF programs to just randomly modify the 
> > > return value of kernel functions sounds immensely broken to me.
> > > 
> > > (And yes, I realize that kprobes are used here as a vehicle, but the 
> > > point 
> > > remains.)
> > 
> > yeah. modifying arbitrary function return pushes bpf outside of
> > its safety guarantees and in that sense doing the same
> > override_return could be done from a kernel module if kernel
> > provides the x64 side of the facility introduced by this patch.
> > On the other side adding parts of this feature to the kernel only
> > to be used by external kernel module is quite ugly too and not
> > something that was ever done before.
> > How about we restrict this bpf_override_return() only to the functions
> > which callers expect to handle errors ?
> > We can add something similar to NOKPROBE_SYMBOL(). Like
> > ALLOW_RETURN_OVERRIDE() and on btrfs side mark the functions
> > we're going to test with this feature.
> >
> > Then 'not crashing kernel' requirement will be preserved.
> > btrfs or whatever else we will be testing with override_return
> > will be functioning in 'stress test' mode and if bpf program
> > is not careful and returns error all the time then one particular
> > subsystem (like btrfs) will not be functional, but the kernel
> > will not be crashing.
> > Thoughts?
> 
> Yeah, that approach sounds much better to me: it should be fundamentally be 
> opt-in, and should be documented that it should not be possible to crash the 
> kernel via changing the return value.
> 
> I'd make it a bit clearer in the naming what the purpose of the annotation 
> is: for 
> example would BPF_ALLOW_ERROR_INJECTION() work for you guys? I.e. I think it 
> should generally be used to change actual integer error values - or at most 
> user 
> pointers, but not kernel pointers. Not enforced in a type safe manner, but 
> the 
> naming should give enough hints?
> 
> Such return-injection BFR programs can still totally confuse user-space 
> obviously: 
> for example returning an IO error could corrupt application data - but that's 
> the 
> nature of such facilities and similar results could already be achieved via 
> ptrace 
> as well. But the result of a BPF program should never be _worse_ than ptrace, 
> in 
> terms of kernel integrity.
> 
> Note that with such a safety mechanism in place no kernel message has to be 
> generated either I suspect.
> 
> In any case, my NAK would be lifted with such an approach.
> 

I'm going to want to annotate kmalloc, so it's still going to be possible to
make things go horribly wrong, is this still going to be ok with you?  Obviously
I want to use this for btrfs, but really what I used this for originally was an
NBD problem where I had to do special handling for getting EINTR back from
kernel_sendmsg, which was a pain to trigger properly without this patch.  Opt-in
is going to make it so we're just flagging important function calls anwyay
because those are the ones that fail rarely and that we want to test, which puts
us back in the same situation you are worried about, so it doesn't make much
sense to me to do it this way.  Thanks,

Josef

Re: [PATCH 1/2] bpf: add a bpf_override_function helper

2017-11-11 Thread Josef Bacik

On Sat, Nov 11, 2017 at 09:14:55AM +0100, Ingo Molnar wrote:
> 
> * Josef Bacik <jo...@toxicpanda.com> wrote:
> 
> > On Fri, Nov 10, 2017 at 10:34:59AM +0100, Ingo Molnar wrote:
> > > 
> > > * Josef Bacik <jo...@toxicpanda.com> wrote:
> > > 
> > > > @@ -551,6 +578,10 @@ static const struct bpf_func_proto 
> > > > *kprobe_prog_func_proto(enum bpf_func_id func
> > > > return _get_stackid_proto;
> > > > case BPF_FUNC_perf_event_read_value:
> > > > return _perf_event_read_value_proto;
> > > > +   case BPF_FUNC_override_return:
> > > > +   pr_warn_ratelimited("%s[%d] is installing a program 
> > > > with bpf_override_return helper that may cause unexpected behavior!",
> > > > +   current->comm, 
> > > > task_pid_nr(current));
> > > > +   return _override_return_proto;
> > > 
> > > So if this new functionality is used we'll always print this into the 
> > > syslog?
> > > 
> > > The warning is also a bit passive aggressive about informing the user: 
> > > what 
> > > unexpected behavior can happen, what is the worst case?
> > > 
> > 
> > It's modeled after the other warnings bpf will spit out, but with this 
> > feature
> > you are skipping a function and instead returning some arbitrary value, so
> > anything could go wrong if you mess something up.  For instance I screwed 
> > up my
> > initial test case and made every IO submitted return an error instead of 
> > just on
> > the one file system I was attempting to test, so all sorts of hilarity 
> > ensued.
> 
> Ok, then for the x86 bits:
> 
>   NAK-ed-by: Ingo Molnar <mi...@kernel.org>
> 
> One of the major advantages of having an in-kernel BPF sandbox is to never 
> crash 
> the kernel - and allowing BPF programs to just randomly modify the return 
> value of 
> kernel functions sounds immensely broken to me.
> 
> (And yes, I realize that kprobes are used here as a vehicle, but the point 
> remains.)
>

Only root can use this feature, and did you read the first email?  The whole
point of this is that error path checkig fucking sucks, and this gives us the
ability to systematically check our error paths and make the kernel way more
robust than it currently is.  Can things go wrong?  Sure, that's why its a
config option and root only.  You only want to turn this on for testing and not
have it on in production.  This is a valuable tool and well worth the risk.
Thanks,

Josef

Re: [PATCH 1/2] bpf: add a bpf_override_function helper

2017-11-10 Thread Josef Bacik

On Fri, Nov 10, 2017 at 10:34:59AM +0100, Ingo Molnar wrote:
> 
> * Josef Bacik <jo...@toxicpanda.com> wrote:
> 
> > @@ -551,6 +578,10 @@ static const struct bpf_func_proto 
> > *kprobe_prog_func_proto(enum bpf_func_id func
> > return _get_stackid_proto;
> > case BPF_FUNC_perf_event_read_value:
> > return _perf_event_read_value_proto;
> > +   case BPF_FUNC_override_return:
> > +   pr_warn_ratelimited("%s[%d] is installing a program with 
> > bpf_override_return helper that may cause unexpected behavior!",
> > +   current->comm, task_pid_nr(current));
> > +   return _override_return_proto;
> 
> So if this new functionality is used we'll always print this into the syslog?
> 
> The warning is also a bit passive aggressive about informing the user: what 
> unexpected behavior can happen, what is the worst case?
> 

It's modeled after the other warnings bpf will spit out, but with this feature
you are skipping a function and instead returning some arbitrary value, so
anything could go wrong if you mess something up.  For instance I screwed up my
initial test case and made every IO submitted return an error instead of just on
the one file system I was attempting to test, so all sorts of hilarity ensued.
Thanks,

Josef

[PATCH 1/2] bpf: add a bpf_override_function helper

2017-11-07 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/Kconfig |  3 +++
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/kprobes.h   |  4 
 arch/x86/include/asm/ptrace.h|  5 +
 arch/x86/kernel/kprobes/ftrace.c | 14 ++
 include/linux/filter.h   |  3 ++-
 include/linux/trace_events.h |  1 +
 include/uapi/linux/bpf.h |  7 ++-
 kernel/bpf/core.c|  3 +++
 kernel/bpf/verifier.c|  2 ++
 kernel/events/core.c |  7 +++
 kernel/trace/Kconfig | 11 +++
 kernel/trace/bpf_trace.c | 35 +++
 kernel/trace/trace_kprobe.c  | 40 +---
 kernel/trace/trace_probe.h   |  6 ++
 15 files changed, 133 insertions(+), 9 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index d789a89cb32c..4fb618082259 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -195,6 +195,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
+config HAVE_KPROBE_OVERRIDE
+   bool
+
 config HAVE_NMI
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac13506..5126d2750dd0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+   select HAVE_KPROBE_OVERRIDE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 91c04c8e67fa..f04e71800c2f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -108,6 +108,11 @@ static inline unsigned long regs_return_value(struct 
pt_regs *regs)
return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long 
rc)
+{
+   regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+   ".type override_func, @function\n"
+   "override_func:\n"
+   "   ret\n"
+   ".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+   regs->ip = (unsigned long)_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index cdd78a7beaae..dfa44fd74bae 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,8 @@ struct bpf_prog {
locked:1,   /* Program image locked? */
gpl_compatible:1, /* Is filter GPL compatible? 
*/
cb_access:1,/* Is control block accessed? */
-   dst_needed:1;   /* Do we need dst entry? */
+   dst_needed:1,   /* Do we need dst entry? */
+   kprobe_override:1; /* Do we override a kprobe? 
*/
kmemcheck_bitfield_end(meta);
enum bpf_prog_type  type;   /* Type of BPF program */
u32 len;/* Number of filter blocks */
diff --git a/include/linux/trace_ev

[PATCH 2/2] samples/bpf: add a test for bpf_override_return

2017-11-07 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 samples/bpf/Makefile  |  4 
 samples/bpf/test_override_return.sh   | 15 +++
 samples/bpf/tracex7_kern.c| 16 
 samples/bpf/tracex7_user.c| 28 
 tools/include/uapi/linux/bpf.h|  7 ++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ea2b9e6135f3..83d06bc1f710 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -100,6 +102,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -153,6 +156,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh 
b/samples/bpf/test_override_return.sh
new file mode 100755
index ..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+   echo "SUCCESS!"
+else
+   echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index ..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   unsigned long rc = -12;
+
+   bpf_override_return(ctx, rc);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index ..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+   FILE *f;
+   char filename[256];
+   char command[256];
+   int ret;
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+   f = popen(command, "r");
+   ret = pclose(f);
+
+   return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4a4b6e78c977..3756dde69834 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -673,6 +673,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -732,7 +736,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_value),   \
-   FN(getsockopt),
+

[PATCH 0/2][v5] Add the ability to do BPF directed error injection

2017-11-07 Thread Josef Bacik

I'm sending this through Dave since it'll conflict with other BPF changes in his
tree, but since it touches tracing as well Dave would like a review from
somebody on the tracing side.

v4->v5:
- disallow kprobe_override programs from being put in the prog map array so we
  don't tail call into something we didn't check.  This allows us to make the
  normal path still fast without a bunch of percpu operations.

v3->v4:
- fix a build error found by kbuild test bot (I didn't wait long enough
  apparently.)
- Added a warning message as per Daniels suggestion.

v2->v3:
- added a ->kprobe_override flag to bpf_prog.
- added some sanity checks to disallow attaching bpf progs that have
  ->kprobe_override set that aren't for ftrace kprobes.
- added the trace_kprobe_ftrace helper to check if the trace_event_call is a
  ftrace kprobe.
- renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read this
  value in the kprobe path, and thus only write to it if we're overriding or
  clearing the override.

v1->v2:
- moved things around to make sure that bpf_override_return could really only be
  used for an ftrace kprobe.
- killed the special return values from trace_call_bpf.
- renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
  it was being called from an ftrace kprobe context.
- reworked the logic in kprobe_perf_func to take advantage of bpf_kprobe_state.
- updated the test as per Alexei's review.

- Original message -

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproduceable results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actual do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

Re: [PATCH 1/2] bpf: add a bpf_override_function helper

2017-11-03 Thread Josef Bacik

On Fri, Nov 03, 2017 at 12:12:13AM +0100, Daniel Borkmann wrote:
> Hi Josef,
> 
> one more issue I just noticed, see comment below:
> 
> On 11/02/2017 03:37 PM, Josef Bacik wrote:
> [...]
> > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > index cdd78a7beaae..dfa44fd74bae 100644
> > --- a/include/linux/filter.h
> > +++ b/include/linux/filter.h
> > @@ -458,7 +458,8 @@ struct bpf_prog {
> > locked:1,   /* Program image locked? */
> > gpl_compatible:1, /* Is filter GPL compatible? 
> > */
> > cb_access:1,/* Is control block accessed? */
> > -   dst_needed:1;   /* Do we need dst entry? */
> > +   dst_needed:1,   /* Do we need dst entry? */
> > +   kprobe_override:1; /* Do we override a kprobe? 
> > */
> > kmemcheck_bitfield_end(meta);
> > enum bpf_prog_type  type;   /* Type of BPF program */
> > u32 len;/* Number of filter blocks */
> [...]
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index d906775e12c1..f8f7927a9152 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -4189,6 +4189,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env 
> > *env)
> > prog->dst_needed = 1;
> > if (insn->imm == BPF_FUNC_get_prandom_u32)
> > bpf_user_rnd_init_once();
> > +   if (insn->imm == BPF_FUNC_override_return)
> > +   prog->kprobe_override = 1;
> > if (insn->imm == BPF_FUNC_tail_call) {
> > /* If we tail call into other programs, we
> >  * cannot make any assumptions since they can
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 9660ee65fbef..0d7fce52391d 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -8169,6 +8169,13 @@ static int perf_event_set_bpf_prog(struct perf_event 
> > *event, u32 prog_fd)
> > return -EINVAL;
> > }
> > 
> > +   /* Kprobe override only works for kprobes, not uprobes. */
> > +   if (prog->kprobe_override &&
> > +   !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
> > +   bpf_prog_put(prog);
> > +   return -EINVAL;
> > +   }
> 
> Can we somehow avoid the prog->kprobe_override flag here completely
> and also same in the perf_event_attach_bpf_prog() handler?
> 
> Reason is that it's not reliable for bailing out this way: Think of
> the main program you're attaching doesn't use bpf_override_return()
> helper, but it tail-calls into other BPF progs that make use of it
> instead. So above check would be useless and will fail and we continue
> to attach the prog for probes where it's not intended to be used.
> 
> We've had similar issues in the past e.g. c2002f983767 ("bpf: fix
> checking xdp_adjust_head on tail calls") is just one of those. Thus,
> can we avoid the flag altogether and handle such error case differently?
> 

So if I'm reading this right there's no way to know what we'll tail call at any
given point, so I need to go back to my previous iteration of this patch and
always save the state of the kprobe in the per-cpu variable to make sure we
don't use bpf_override_return in the wrong case?

The tail call functions won't be in the BPF_PROG_ARRAY right?  It'll be just
some other arbitrary function?  If that's the case then we really need something
like this

https://patchwork.kernel.org/patch/10034815/

and I need to bring that back right?  Thanks,

Josef

[PATCH 0/2][v4] Add the ability to do BPF directed error injection

2017-11-02 Thread Josef Bacik

I'm sending this through Dave since it'll conflict with other BPF changes in his
tree, but since it touches tracing as well Dave would like a review from
somebody on the tracing side.

v3->v4:
- fix a build error found by kbuild test bot (I didn't wait long enough
  apparently.)
- Added a warning message as per Daniels suggestion.

v2->v3:
- added a ->kprobe_override flag to bpf_prog.
- added some sanity checks to disallow attaching bpf progs that have
  ->kprobe_override set that aren't for ftrace kprobes.
- added the trace_kprobe_ftrace helper to check if the trace_event_call is a
  ftrace kprobe.
- renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read this
  value in the kprobe path, and thus only write to it if we're overriding or
  clearing the override.

v1->v2:
- moved things around to make sure that bpf_override_return could really only be
  used for an ftrace kprobe.
- killed the special return values from trace_call_bpf.
- renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
  it was being called from an ftrace kprobe context.
- reworked the logic in kprobe_perf_func to take advantage of bpf_kprobe_state.
- updated the test as per Alexei's review.

- Original message -

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproduceable results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actual do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

[PATCH 2/2] samples/bpf: add a test for bpf_override_return

2017-11-02 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 samples/bpf/Makefile  |  4 
 samples/bpf/test_override_return.sh   | 15 +++
 samples/bpf/tracex7_kern.c| 16 
 samples/bpf/tracex7_user.c| 28 
 tools/include/uapi/linux/bpf.h|  7 ++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ea2b9e6135f3..83d06bc1f710 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -100,6 +102,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -153,6 +156,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh 
b/samples/bpf/test_override_return.sh
new file mode 100755
index ..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+   echo "SUCCESS!"
+else
+   echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index ..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   unsigned long rc = -12;
+
+   bpf_override_return(ctx, rc);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index ..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+   FILE *f;
+   char filename[256];
+   char command[256];
+   int ret;
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+   f = popen(command, "r");
+   ret = pclose(f);
+
+   return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4a4b6e78c977..3756dde69834 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -673,6 +673,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -732,7 +736,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_value),   \
-   FN(getsockopt),
+

[PATCH 1/2] bpf: add a bpf_override_function helper

2017-11-02 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/Kconfig |  3 +++
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/kprobes.h   |  4 
 arch/x86/include/asm/ptrace.h|  5 +
 arch/x86/kernel/kprobes/ftrace.c | 14 ++
 include/linux/filter.h   |  3 ++-
 include/linux/trace_events.h |  1 +
 include/uapi/linux/bpf.h |  7 ++-
 kernel/bpf/verifier.c|  2 ++
 kernel/events/core.c |  7 +++
 kernel/trace/Kconfig | 11 +++
 kernel/trace/bpf_trace.c | 35 +++
 kernel/trace/trace_kprobe.c  | 40 +---
 kernel/trace/trace_probe.h   |  6 ++
 14 files changed, 130 insertions(+), 9 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index d789a89cb32c..4fb618082259 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -195,6 +195,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
+config HAVE_KPROBE_OVERRIDE
+   bool
+
 config HAVE_NMI
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac13506..5126d2750dd0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+   select HAVE_KPROBE_OVERRIDE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 91c04c8e67fa..f04e71800c2f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -108,6 +108,11 @@ static inline unsigned long regs_return_value(struct 
pt_regs *regs)
return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long 
rc)
+{
+   regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+   ".type override_func, @function\n"
+   "override_func:\n"
+   "   ret\n"
+   ".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+   regs->ip = (unsigned long)_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index cdd78a7beaae..dfa44fd74bae 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,8 @@ struct bpf_prog {
locked:1,   /* Program image locked? */
gpl_compatible:1, /* Is filter GPL compatible? 
*/
cb_access:1,/* Is control block accessed? */
-   dst_needed:1;   /* Do we need dst entry? */
+   dst_needed:1,   /* Do we need dst entry? */
+   kprobe_override:1; /* Do we override a kprobe? 
*/
kmemcheck_bitfield_end(meta);
enum bpf_prog_type  type;   /* Type of BPF program */
u32 len;/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
inde

[PATCH 0/2][v3] Add the ability to do BPF directed error injection

2017-11-01 Thread Josef Bacik

I'm sending this through Dave since it'll conflict with other BPF changes in his
tree, but since it touches tracing as well Dave would like a review from
somebody on the tracing side.

v2->v3:
- added a ->kprobe_override flag to bpf_prog.
- added some sanity checks to disallow attaching bpf progs that have
  ->kprobe_override set that aren't for ftrace kprobes.
- added the trace_kprobe_ftrace helper to check if the trace_event_call is a
  ftrace kprobe.
- renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read this
  value in the kprobe path, and thus only write to it if we're overriding or
  clearing the override.

v1->v2:
- moved things around to make sure that bpf_override_return could really only be
  used for an ftrace kprobe.
- killed the special return values from trace_call_bpf.
- renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
  it was being called from an ftrace kprobe context.
- reworked the logic in kprobe_perf_func to take advantage of bpf_kprobe_state.
- updated the test as per Alexei's review.

- Original message - 

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproduceable results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actual do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

[PATCH 1/2] bpf: add a bpf_override_function helper

2017-11-01 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/Kconfig |  3 +++
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/kprobes.h   |  4 
 arch/x86/include/asm/ptrace.h|  5 +
 arch/x86/kernel/kprobes/ftrace.c | 14 ++
 include/linux/filter.h   |  3 ++-
 include/linux/trace_events.h |  1 +
 include/uapi/linux/bpf.h |  7 ++-
 kernel/bpf/verifier.c|  2 ++
 kernel/events/core.c |  7 +++
 kernel/trace/Kconfig | 11 +++
 kernel/trace/bpf_trace.c | 33 +
 kernel/trace/trace_kprobe.c  | 40 +---
 kernel/trace/trace_probe.h   |  2 ++
 14 files changed, 124 insertions(+), 9 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index d789a89cb32c..4fb618082259 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -195,6 +195,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
+config HAVE_KPROBE_OVERRIDE
+   bool
+
 config HAVE_NMI
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac13506..5126d2750dd0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+   select HAVE_KPROBE_OVERRIDE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 91c04c8e67fa..f04e71800c2f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -108,6 +108,11 @@ static inline unsigned long regs_return_value(struct 
pt_regs *regs)
return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long 
rc)
+{
+   regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+   ".type override_func, @function\n"
+   "override_func:\n"
+   "   ret\n"
+   ".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+   regs->ip = (unsigned long)_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index cdd78a7beaae..dfa44fd74bae 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,8 @@ struct bpf_prog {
locked:1,   /* Program image locked? */
gpl_compatible:1, /* Is filter GPL compatible? 
*/
cb_access:1,/* Is control block accessed? */
-   dst_needed:1;   /* Do we need dst entry? */
+   dst_needed:1,   /* Do we need dst entry? */
+   kprobe_override:1; /* Do we override a kprobe? 
*/
kmemcheck_bitfield_end(meta);
enum bpf_prog_type  type;   /* Type of BPF program */
u32 len;/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index fc6aeca945db..be8bd5a8efaa 100644
--- a/include/linux/trace_ev

[PATCH 2/2] samples/bpf: add a test for bpf_override_return

2017-11-01 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 samples/bpf/Makefile  |  4 
 samples/bpf/test_override_return.sh   | 15 +++
 samples/bpf/tracex7_kern.c| 16 
 samples/bpf/tracex7_user.c| 28 
 tools/include/uapi/linux/bpf.h|  7 ++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ea2b9e6135f3..83d06bc1f710 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -100,6 +102,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -153,6 +156,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh 
b/samples/bpf/test_override_return.sh
new file mode 100755
index ..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+   echo "SUCCESS!"
+else
+   echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index ..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   unsigned long rc = -12;
+
+   bpf_override_return(ctx, rc);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index ..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+   FILE *f;
+   char filename[256];
+   char command[256];
+   int ret;
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+   f = popen(command, "r");
+   ret = pclose(f);
+
+   return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4a4b6e78c977..3756dde69834 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -673,6 +673,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -732,7 +736,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_value),   \
-   FN(getsockopt),
+   FN(getsockopt), \
+   FN(override

[PATCH 0/2][v2] Add the ability to do BPF directed error injection

2017-10-31 Thread Josef Bacik

v1->v2:
- moved things around to make sure that bpf_override_return could really only be
  used for an ftrace kprobe.
- killed the special return values from trace_call_bpf.
- renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
  it was being called from an ftrace kprobe context.
- reworked the logic in kprobe_perf_func to take advantage of bpf_kprobe_state.
- updated the test as per Alexei's review.

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproduceable results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actual do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

[PATCH 1/2] bpf: add a bpf_override_function helper

2017-10-31 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/Kconfig |  3 +++
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/kprobes.h   |  4 
 arch/x86/include/asm/ptrace.h|  5 +
 arch/x86/kernel/kprobes/ftrace.c | 14 ++
 include/linux/trace_events.h |  7 +++
 include/uapi/linux/bpf.h |  7 ++-
 kernel/trace/Kconfig | 11 +++
 kernel/trace/bpf_trace.c | 30 
 kernel/trace/trace_kprobe.c  | 42 +---
 10 files changed, 116 insertions(+), 8 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index d789a89cb32c..4fb618082259 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -195,6 +195,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
+config HAVE_KPROBE_OVERRIDE
+   bool
+
 config HAVE_NMI
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac13506..5126d2750dd0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+   select HAVE_KPROBE_OVERRIDE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 91c04c8e67fa..f04e71800c2f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -108,6 +108,11 @@ static inline unsigned long regs_return_value(struct 
pt_regs *regs)
return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long 
rc)
+{
+   regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+   ".type override_func, @function\n"
+   "override_func:\n"
+   "   ret\n"
+   ".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+   regs->ip = (unsigned long)_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index fc6aeca945db..9179f109c49b 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -521,7 +521,14 @@ do {   
\
 #ifdef CONFIG_PERF_EVENTS
 struct perf_event;
 
+enum {
+   BPF_STATE_NORMAL_KPROBE = 0,
+   BPF_STATE_FTRACE_KPROBE,
+   BPF_STATE_MODIFIED_PC,
+};
+
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
+DECLARE_PER_CPU(int, bpf_kprobe_state);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0b7b54d898bd..1ad5b87a42f6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -673,6 +673,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec),

[PATCH 2/2] samples/bpf: add a test for bpf_override_return

2017-10-31 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 samples/bpf/Makefile  |  4 
 samples/bpf/test_override_return.sh   | 15 +++
 samples/bpf/tracex7_kern.c| 16 
 samples/bpf/tracex7_user.c| 28 
 tools/include/uapi/linux/bpf.h|  7 ++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ea2b9e6135f3..83d06bc1f710 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -100,6 +102,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -153,6 +156,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh 
b/samples/bpf/test_override_return.sh
new file mode 100755
index ..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+   echo "SUCCESS!"
+else
+   echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index ..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   unsigned long rc = -12;
+
+   bpf_override_return(ctx, rc);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index ..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+   FILE *f;
+   char filename[256];
+   char command[256];
+   int ret;
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+   f = popen(command, "r");
+   ret = pclose(f);
+
+   return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4a4b6e78c977..3756dde69834 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -673,6 +673,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -732,7 +736,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_value),   \
-   FN(getsockopt),
+   FN(getsockopt), \
+   FN(override

[PATCH 0/2] Add the ability to do BPF directed error injection

2017-10-30 Thread Josef Bacik

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproduceable results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actual do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

[PATCH 2/2] samples/bpf: add a test for bpf_override_return

2017-10-30 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 samples/bpf/Makefile  |  4 
 samples/bpf/test_override_return.sh   | 15 +++
 samples/bpf/tracex7_kern.c| 15 +++
 samples/bpf/tracex7_user.c| 28 
 tools/include/uapi/linux/bpf.h|  7 ++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ea2b9e6135f3..83d06bc1f710 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -100,6 +102,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -153,6 +156,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh 
b/samples/bpf/test_override_return.sh
new file mode 100755
index ..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+   echo "SUCCESS!"
+else
+   echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index ..a2f74f736e66
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,15 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   unsigned long rc = -12;
+   bpf_override_return(ctx, rc);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index ..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+   FILE *f;
+   char filename[256];
+   char command[256];
+   int ret;
+
+   snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+   if (load_bpf_file(filename)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+
+   snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+   f = popen(command, "r");
+   ret = pclose(f);
+
+   return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4a4b6e78c977..3756dde69834 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -673,6 +673,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -732,7 +736,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_value),   \
-   FN(getsockopt),
+   FN(getsockopt), \
+   FN(override

[PATCH 1/2] bpf: add a bpf_override_function helper

2017-10-30 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 arch/Kconfig |  3 +++
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/kprobes.h   |  4 
 arch/x86/include/asm/ptrace.h|  5 +
 arch/x86/kernel/kprobes/ftrace.c | 14 
 include/uapi/linux/bpf.h |  7 +-
 kernel/trace/Kconfig | 11 ++
 kernel/trace/bpf_trace.c | 47 +++-
 kernel/trace/trace.h |  6 +
 kernel/trace/trace_kprobe.c  | 23 ++--
 10 files changed, 108 insertions(+), 13 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index d789a89cb32c..4fb618082259 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -195,6 +195,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
bool
 
+config HAVE_KPROBE_OVERRIDE
+   bool
+
 config HAVE_NMI
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac13506..5126d2750dd0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
+   select HAVE_KPROBE_OVERRIDE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH   if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 91c04c8e67fa..f04e71800c2f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -108,6 +108,11 @@ static inline unsigned long regs_return_value(struct 
pt_regs *regs)
return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long 
rc)
+{
+   regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
p->ainsn.boostable = false;
return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+   ".type override_func, @function\n"
+   "override_func:\n"
+   "   ret\n"
+   ".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+   regs->ip = (unsigned long)_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0b7b54d898bd..1ad5b87a42f6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -673,6 +673,10 @@ union bpf_attr {
  * @buf: buf to fill
  * @buf_size: size of the buf
  * Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ * @pt_regs: pointer to struct pt_regs
+ * @rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -732,7 +736,8 @@ union bpf_attr {
FN(xdp_adjust_meta),\
FN(perf_event_read_value),  \
FN(perf_prog_read_value),   \
-   FN(getsockopt),
+   FN(getsockopt), \
+   FN(override_return),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..9dc0deeaad2b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -518,6 +518,17 @@ config FUNCTION_PROFILER
 
  If in doubt, say N.
 
+config BPF_KPROBE_OVERRIDE

Re: [PATCH 0/3] fix reuseaddr regression

2017-09-22 Thread Josef Bacik

On Tue, Sep 19, 2017 at 01:50:56PM -0700, David Miller wrote:
> From: jo...@toxicpanda.com
> Date: Mon, 18 Sep 2017 12:28:54 -0400
> 
> > I introduced a regression when reworking the fastreuse port stuff that 
> > allows
> > bind conflicts to occur once a reuseaddr socket successfully opens on an
> > existing tb.  The root cause is I reversed an if statement which caused us 
> > to
> > set the tb as if there were no owners on the socket if there were, which
> > obviously is not correct.
> > 
> > Dave I have follow up patches that will add a selftest for this case and I 
> > ran
> > the other reuseport related tests as well.  These need to go in pretty 
> > quickly
> > as it breaks kvm, I've marked them for stable.  Sorry for the regression,
> 
> First, please fix your "From: " field so that it actually has your full
> name rather than just your email address.  This matter when I apply
> your patches.
> 
> Second, remove the stable CC:.  For networking changes, you simply ask
> me to queue the changes up for -stable.
> 

Sorry Dave, I've fixed my git email settings and I droped the stable cc and sent
a new round.  Didn't see this until just now, my bad.

Josef

[PATCH 1/3] net: set tb->fast_sk_family

2017-09-22 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

We need to set the tb->fast_sk_family properly so we can use the proper
comparison function for all subsequent reuseport bind requests.

Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk")
Reported-and-tested-by: Cole Robinson <crobi...@redhat.com>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 net/ipv4/inet_connection_sock.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index b9c64b40a83a..f87f4805e244 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -328,6 +328,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
tb->fastuid = uid;
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
+   tb->fast_sk_family = sk->sk_family;
 #if IS_ENABLED(CONFIG_IPV6)
tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
 #endif
@@ -354,6 +355,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
tb->fastuid = uid;
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
+   tb->fast_sk_family = sk->sk_family;
 #if IS_ENABLED(CONFIG_IPV6)
tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
 #endif
-- 
2.7.4

[PATCH 0/3] fix reuseaddr regression

2017-09-22 Thread Josef Bacik

I introduced a regression when reworking the fastreuse port stuff that allows
bind conflicts to occur once a reuseaddr successfully opens on an existing tb.
The root cause is I reversed an if statement which caused us to set the tb as if
there were no owners on the socket if there were, which obviously is not
correct.

Dave could you please queue these changes up for -stable, I've run them through
the net tests and added another test to check for this problem specifically.
Thanks,

Josef

[PATCH 2/3] net: use inet6_rcv_saddr to compare sockets

2017-09-22 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

In ipv6_rcv_saddr_equal() we need to use inet6_rcv_saddr(sk) for the
ipv6 compare with the fast socket information to make sure we're doing
the proper comparisons.

Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk")
Reported-and-tested-by: Cole Robinson <crobi...@redhat.com>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 net/ipv4/inet_connection_sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f87f4805e244..a1bf30438bc5 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -266,7 +266,7 @@ static inline int sk_reuseport_match(struct 
inet_bind_bucket *tb,
 #if IS_ENABLED(CONFIG_IPV6)
if (tb->fast_sk_family == AF_INET6)
return ipv6_rcv_saddr_equal(>fast_v6_rcv_saddr,
-   >sk_v6_rcv_saddr,
+   inet6_rcv_saddr(sk),
tb->fast_rcv_saddr,
sk->sk_rcv_saddr,
tb->fast_ipv6_only,
-- 
2.7.4

[PATCH 3/3] inet: fix improper empty comparison

2017-09-22 Thread Josef Bacik

From: Josef Bacik <jba...@fb.com>

When doing my reuseport rework I screwed up and changed a

if (hlist_empty(>owners))

to

if (!hlist_empty(>owners))

This is obviously bad as all of the reuseport/reuse logic was reversed,
which caused weird problems like allowing an ipv4 bind conflict if we
opened an ipv4 only socket on a port followed by an ipv6 only socket on
the same port.

Fixes: b9470c27607b ("inet: kill smallest_size and smallest_port")
Reported-by: Cole Robinson <crobi...@redhat.com>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 net/ipv4/inet_connection_sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index a1bf30438bc5..c039c937ba90 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -321,7 +321,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
goto fail_unlock;
}
 success:
-   if (!hlist_empty(>owners)) {
+   if (hlist_empty(>owners)) {
tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = FASTREUSEPORT_ANY;
-- 
2.7.4

Re: [PATCH 2/3] selftests: actually run the various net selftests

2017-09-19 Thread Josef Bacik

On Mon, Sep 18, 2017 at 04:14:41PM -0600, Shuah Khan wrote:
> On 09/18/2017 11:32 AM, jo...@toxicpanda.com wrote:
> > From: Josef Bacik <jba...@fb.com>
> > 
> > These self tests are just self contained binaries, they are not run by
> > any of the scripts in the directory.  This means they need to be marked
> > with TEST_GEN_PROGS to actually be run, not TEST_GEN_FILES.
> > 
> > Signed-off-by: Josef Bacik <jba...@fb.com>
> > ---
> >  tools/testing/selftests/net/Makefile | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/net/Makefile 
> > b/tools/testing/selftests/net/Makefile
> > index 3df542c84610..45a4e77a47c4 100644
> > --- a/tools/testing/selftests/net/Makefile
> > +++ b/tools/testing/selftests/net/Makefile
> > @@ -6,8 +6,8 @@ CFLAGS += -I../../../../usr/include/
> >  TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh 
> > rtnetlink.sh
> >  TEST_GEN_FILES =  socket
> >  TEST_GEN_FILES += psock_fanout psock_tpacket
> > -TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
> > -TEST_GEN_FILES += reuseport_dualstack msg_zerocopy reuseaddr_conflict
> > +TEST_GEN_PROGS += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
> > +TEST_GEN_PROGS += reuseport_dualstack msg_zerocopy reuseaddr_conflict
> 
> Hmm. I see msg_zerocopy.sh for running msg_zerocopy. msg_zerocopy should
> still stay in TEST_GEN_FILES and msg_zerocopy.sh needs to be added to
> TEST_PROGS so it runs.
> 

Actually the shell script requires arguments, it doesn't just run the test.
I'll fix this to just omit the test for now as it's not setup to run properly.

Willem, could you follow up with a patch so that the zero copy test is run
properly the way you envision it running?  You need to make sure that

make -C tools/testing/selftests TARGETS=net run_tests

actually runs your zero copy test the way you expect it to, otherwise it's just
sitting there collecting dust.  Thanks,

Josef

Re: [PATCH 3/3] selftests: silence test output by default

2017-09-18 Thread Josef Bacik

On Mon, Sep 18, 2017 at 01:48:31PM -0600, Shuah Khan wrote:
> On 09/18/2017 12:24 PM, Josef Bacik wrote:
> > On Mon, Sep 18, 2017 at 12:13:40PM -0600, Shuah Khan wrote:
> >> On 09/18/2017 11:52 AM, Josef Bacik wrote:
> >>> On Mon, Sep 18, 2017 at 11:46:18AM -0600, Shuah Khan wrote:
> >>>> On 09/18/2017 11:37 AM, jo...@toxicpanda.com wrote:
> >>>>> From: Josef Bacik <jba...@fb.com>
> >>>>>
> >>>>> Some of the networking tests are very noisy and make it impossible to
> >>>>> see if we actually passed the tests as they run.  Default to suppressing
> >>>>> the output from any tests run in order to make it easier to track what
> >>>>> failed.
> >>>>>
> >>>>> Signed-off-by: Josef Bacik <jba...@fb.com>
> >>>>> --
> >>>>
> >>>> This change suppresses pass/fail wrapper output for all tests, not just 
> >>>> the
> >>>> networking tests.
> >>>>
> >>>> Could you please send me before and after results for what you are trying
> >>>> to fix.
> >>>>
> >>>
> >>> Yeah I wanted to suppress extraneous output from everybody, I just 
> >>> happened to
> >>> notice it because I was testing net.  The default thing already spits out 
> >>> what
> >>> it's running and pass/fail, there's no need to include all of the random 
> >>> output
> >>> unless the user wants to go and run the test manually.  As it is now it's
> >>> _impossible_ to tell what ran and what passed/failed because of all the 
> >>> random
> >>> output.
> >>
> >> Unfortunately kselftests have lots of users that want different things. A 
> >> recent
> >> request is to use TAP13 format for output for external parsers to be able 
> >> to parse.
> >> That is what this change to add TAP13 header does.
> >>
> >> The output you are seeing is the TAP 13 format to indicate the test has 
> >> passed.
> >>
> >> The right fix would be to suppress the Pass/Fail from the individual shell 
> >> script
> >> and have the shell script exit with error code. kselftest lib.mk will 
> >> handle the
> >> error code and print out pass/fail like it is doing now.
> >>
> >> Using the common logic will help avoid duplicate code in tests/test 
> >> scripts and
> >> also makes the pass/fail messages consistent.
> >>
> >> In the following output the individual test output can be eliminated since 
> >> lib.mk
> >> run_tests does that for you. In addition, you will also get a count of 
> >> tests at
> >> the end of the run of all tests in a test directory.
> >>
> >> TAP version 13
> >> selftests: run_netsocktests
> >> 
> >> 
> >> running socket test
> >> 
> >> [PASS]
> >> ok 1..1 selftests: run_netsocktests [PASS]
> >> selftests: run_afpackettests
> >> 
> >> must be run as root
> >> ok 1..2 selftests: run_afpackettests [PASS]
> >> selftests: test_bpf.sh
> >> 
> >> test_bpf: [FAIL]
> >> not ok 1..3 selftests:  test_bpf.sh [FAIL]
> >> selftests: netdevice.sh
> >> 
> >> SKIP: Need root privileges
> >> ok 1..4 selftests: netdevice.sh [PASS]
> >>
> >> If you eliminate that you will just see the common lib.mk results.
> >>
> >> TAP version 13
> >> selftests: run_netsocktests
> >> 
> >> ok 1..1 selftests: run_netsocktests [PASS]
> >> selftests: run_afpackettests
> >> 
> >> must be run as root
> >> ok 1..2 selftests: run_afpackettests [PASS]
> >> 
> >> selftests: test_bpf.sh
> >> 
> >> not ok 1..3 selftests:  test_bpf.sh [FAIL]
> >> selftests: netdevice.sh
> >> 
> >> SKIP: Need root privileges
> >> ok 1..4 selftests: netdevice.sh [PASS]
> >>
> >>
> >> If you would like to fix the duplicate output, please send me patches
> >> to remove pass/

Re: [PATCH 3/3] selftests: silence test output by default

2017-09-18 Thread Josef Bacik

On Mon, Sep 18, 2017 at 12:13:40PM -0600, Shuah Khan wrote:
> On 09/18/2017 11:52 AM, Josef Bacik wrote:
> > On Mon, Sep 18, 2017 at 11:46:18AM -0600, Shuah Khan wrote:
> >> On 09/18/2017 11:37 AM, jo...@toxicpanda.com wrote:
> >>> From: Josef Bacik <jba...@fb.com>
> >>>
> >>> Some of the networking tests are very noisy and make it impossible to
> >>> see if we actually passed the tests as they run.  Default to suppressing
> >>> the output from any tests run in order to make it easier to track what
> >>> failed.
> >>>
> >>> Signed-off-by: Josef Bacik <jba...@fb.com>
> >>> --
> >>
> >> This change suppresses pass/fail wrapper output for all tests, not just the
> >> networking tests.
> >>
> >> Could you please send me before and after results for what you are trying
> >> to fix.
> >>
> > 
> > Yeah I wanted to suppress extraneous output from everybody, I just happened 
> > to
> > notice it because I was testing net.  The default thing already spits out 
> > what
> > it's running and pass/fail, there's no need to include all of the random 
> > output
> > unless the user wants to go and run the test manually.  As it is now it's
> > _impossible_ to tell what ran and what passed/failed because of all the 
> > random
> > output.
> 
> Unfortunately kselftests have lots of users that want different things. A 
> recent
> request is to use TAP13 format for output for external parsers to be able to 
> parse.
> That is what this change to add TAP13 header does.
> 
> The output you are seeing is the TAP 13 format to indicate the test has 
> passed.
> 
> The right fix would be to suppress the Pass/Fail from the individual shell 
> script
> and have the shell script exit with error code. kselftest lib.mk will handle 
> the
> error code and print out pass/fail like it is doing now.
> 
> Using the common logic will help avoid duplicate code in tests/test scripts 
> and
> also makes the pass/fail messages consistent.
> 
> In the following output the individual test output can be eliminated since 
> lib.mk
> run_tests does that for you. In addition, you will also get a count of tests 
> at
> the end of the run of all tests in a test directory.
> 
> TAP version 13
> selftests: run_netsocktests
> 
> 
> running socket test
> 
> [PASS]
> ok 1..1 selftests: run_netsocktests [PASS]
> selftests: run_afpackettests
> 
> must be run as root
> ok 1..2 selftests: run_afpackettests [PASS]
> selftests: test_bpf.sh
> 
> test_bpf: [FAIL]
> not ok 1..3 selftests:  test_bpf.sh [FAIL]
> selftests: netdevice.sh
> 
> SKIP: Need root privileges
> ok 1..4 selftests: netdevice.sh [PASS]
> 
> If you eliminate that you will just see the common lib.mk results.
> 
> TAP version 13
> selftests: run_netsocktests
> 
> ok 1..1 selftests: run_netsocktests [PASS]
> selftests: run_afpackettests
> 
> must be run as root
> ok 1..2 selftests: run_afpackettests [PASS]
> 
> selftests: test_bpf.sh
> 
> not ok 1..3 selftests:  test_bpf.sh [FAIL]
> selftests: netdevice.sh
> 
> SKIP: Need root privileges
> ok 1..4 selftests: netdevice.sh [PASS]
> 
> 
> If you would like to fix the duplicate output, please send me patches
> to remove pass/fail output strings from tests instead. It is on my
> todo to do that this release.
> 

I'm confused, this is exactly what my patch does, it strips all of the
extraneous output and leaves only the TAP13 output.  Here is the output without
my suppression patch

https://da.gd/pup0

and here is the output with my suppression patch

https://da.gd/3olKj

Unless I'm missing something subtle it appears to be exactly the output you
want, without the random crap from the other tests.  The only thing I'm
redirecting is the output of the _test_ itself, $$BASENAME_TEST from what I can
tell is the actual test we're running, not the wrapper, so everything is as it
should be.  Thanks,

Josef

Re: [PATCH 3/3] selftests: silence test output by default

2017-09-18 Thread Josef Bacik

On Mon, Sep 18, 2017 at 11:46:18AM -0600, Shuah Khan wrote:
> On 09/18/2017 11:37 AM, jo...@toxicpanda.com wrote:
> > From: Josef Bacik <jba...@fb.com>
> > 
> > Some of the networking tests are very noisy and make it impossible to
> > see if we actually passed the tests as they run.  Default to suppressing
> > the output from any tests run in order to make it easier to track what
> > failed.
> > 
> > Signed-off-by: Josef Bacik <jba...@fb.com>
> > --
> 
> This change suppresses pass/fail wrapper output for all tests, not just the
> networking tests.
> 
> Could you please send me before and after results for what you are trying
> to fix.
> 

Yeah I wanted to suppress extraneous output from everybody, I just happened to
notice it because I was testing net.  The default thing already spits out what
it's running and pass/fail, there's no need to include all of the random output
unless the user wants to go and run the test manually.  As it is now it's
_impossible_ to tell what ran and what passed/failed because of all the random
output.

Ideally kselftests would work like xfstests does and simply capture the output
to a log so you could go check afterwards, but that's a lot more work.  Making
it easier to tell which tests passed/failed is a good enough first step.
Thanks,

Josef

Re: 319554f284dd ("inet: don't use sk_v6_rcv_saddr directly") causes bind port regression

2017-09-15 Thread Josef Bacik

Finally got access to a box to run this down myself.  This patch on top of the 
other patches fixes the problem for me, could you verify it works for you?  
Thanks,

Josef

On 9/13/17, 3:49 PM, "Cole Robinson" <crobi...@redhat.com> wrote:

On 09/13/2017 03:44 PM, Josef Bacik wrote:
> Alright thanks, this should fix it.
> 

Still no luck with all three patches applied to fedora 4.12.8-300 RPM. Pretty
sure I didn't mess up the testing but since I rarely do kernel builds it's not
impossible...

Thanks,
Cole

0001-net-call-sk_reuseport_match-if-we-are-a-reusesock.patch
Description: 0001-net-call-sk_reuseport_match-if-we-are-a-reusesock.patch

Re: 319554f284dd ("inet: don't use sk_v6_rcv_saddr directly") causes bind port regression

2017-09-13 Thread Josef Bacik


> On Sep 13, 2017, at 12:46 PM, Chuck Ebbert <cebbert.l...@gmail.com> wrote:
> 
> On Wed, 13 Sep 2017 17:28:25 +0000
> Josef Bacik <jba...@fb.com> wrote:
> 
>> Sorry I thought I had made this other fix, can you apply this on top
>> of the other one and try that?  I have more things to try if this
>> doesn’t work, sorry you are playing go between, but I want to make
>> sure I know _which_ fix actually fixes the problem, and then clean up
>> in followup patches.  Thanks,
>> 
>> Josef
>> 
>> On 9/13/17, 8:45 AM, "Laura Abbott" <labb...@redhat.com> wrote:
>> 
>> On 09/12/2017 04:12 PM, Josef Bacik wrote:
>>> First I’m super sorry for the top post, I’m at plumbers and I
>>> forgot to upload my muttrc to my new cloud instance, so I’m screwed
>>> using outlook.
>>> 
>>> I have a completely untested, uncompiled patch that I think will
>>> fix the problem, would you mind giving it a go?  Thanks,
>>> 
>>> Josef  
>> 
>> Thanks for the quick turnaround. Unfortunately, the problem is still
>> reproducible according to the reporter.
>> 
>> Thanks,
>> Laura
> 
> I am confused by the patch that originally caused this:
> 
>if (sk->sk_family == AF_INET6)
>return ipv6_rcv_saddr_equal(>sk_v6_rcv_saddr,
> -   >sk_v6_rcv_saddr,
> +   inet6_rcv_saddr(sk2),
>sk->sk_rcv_saddr,
>sk2->sk_rcv_saddr,
> 
> Shouldn't the first argument also be changed to use inet6_rcv_saddr()?

No we know sk is IPv6 so it's alright to use directly.  Thanks,

Josef

Re: 319554f284dd ("inet: don't use sk_v6_rcv_saddr directly") causes bind port regression

2017-09-13 Thread Josef Bacik

Alright thanks, this should fix it.

Josef

On 9/13/17, 12:14 PM, "Cole Robinson" <crobi...@redhat.com> wrote:

On 09/13/2017 01:40 PM, Cole Robinson wrote:
> On 09/13/2017 01:28 PM, Josef Bacik wrote:
>> Sorry I thought I had made this other fix, can you apply this on top of the 
>> other one and try that?  I have more things to try if this doesn’t work, 
>> sorry you are playing go between, but I want to make sure I know _which_ fix 
>> actually fixes the problem, and then clean up in followup patches.  Thanks,
>>
> 
> I'm the bug reporter. I'll combine the two patches and report back
> 

Nope, issue is still present with both patches applied. Tried my own build and
a package Laura provided

Thanks,
Cole





0001-net-don-t-fast-patch-mismatched-sockets-in-STRICT-mo.patch
Description: 0001-net-don-t-fast-patch-mismatched-sockets-in-STRICT-mo.patch

Re: 319554f284dd ("inet: don't use sk_v6_rcv_saddr directly") causes bind port regression

2017-09-13 Thread Josef Bacik

Sorry I thought I had made this other fix, can you apply this on top of the 
other one and try that?  I have more things to try if this doesn’t work, sorry 
you are playing go between, but I want to make sure I know _which_ fix actually 
fixes the problem, and then clean up in followup patches.  Thanks,

Josef

On 9/13/17, 8:45 AM, "Laura Abbott" <labb...@redhat.com> wrote:

On 09/12/2017 04:12 PM, Josef Bacik wrote:
> First I’m super sorry for the top post, I’m at plumbers and I forgot to 
> upload my muttrc to my new cloud instance, so I’m screwed using outlook.
> 
> I have a completely untested, uncompiled patch that I think will fix the 
> problem, would you mind giving it a go?  Thanks,
> 
> Josef

Thanks for the quick turnaround. Unfortunately, the problem is still
reproducible according to the reporter.

Thanks,
Laura

0001-net-use-inet6_rcv_saddr-to-compare-sockets.patch
Description: 0001-net-use-inet6_rcv_saddr-to-compare-sockets.patch

Re: 319554f284dd ("inet: don't use sk_v6_rcv_saddr directly") causes bind port regression

2017-09-12 Thread Josef Bacik

First I’m super sorry for the top post, I’m at plumbers and I forgot to upload 
my muttrc to my new cloud instance, so I’m screwed using outlook.

I have a completely untested, uncompiled patch that I think will fix the 
problem, would you mind giving it a go?  Thanks,

Josef

On 9/12/17, 3:36 PM, "Laura Abbott" <labb...@redhat.com> wrote:

Hi,

Fedora got a bug report 
https://bugzilla.redhat.com/show_bug.cgi?id=1432684 of a regression with 
automatic spice port
assignment. The libvirt team reduced this to the attached test
case run as follows:

In a separate terminal, qemu-kvm -vnc 127.0.0.1:0 to grab port 5900. 
Then do this:

$ gcc bind-collision.c && ./a.out
bind: Address already in use
AF_INET check failed.
$ gcc -D CHECK_IPV6 bind-collision.c && ./a.out
AF_INET6 success
AF_INET success
$ gcc bind-collision.c && ./a.out
AF_INET success

Bisection showed this behavior to be caused by

commit 319554f284dda9f2737d09df82ba3610bd8ddea3
Author: Josef Bacik <jba...@fb.com>
Date:   Thu Jan 19 17:47:46 2017 -0500

 inet: don't use sk_v6_rcv_saddr directly

 When comparing two sockets we need to use inet6_rcv_saddr so we get 
a NULL
 sk_v6_rcv_saddr if the socket isn't AF_INET6, otherwise our 
comparison function
 can be wrong.

 Fixes: 637bc8b ("inet: reset tb->fastreuseport when adding a 
reuseport sk")
 Signed-off-by: Josef Bacik <jba...@fb.com>
 Signed-off-by: David S. Miller <da...@davemloft.net>


And reverting fixed both the standalone test case and the spice issue.

Any ideas?

Thanks,
Laura




0001-net-set-tb-fast_sk_family.patch
Description: 0001-net-set-tb-fast_sk_family.patch

Re: More BPF verifier questions

2017-06-05 Thread Josef Bacik

On Mon, Jun 05, 2017 at 11:11:05AM -0700, Alexei Starovoitov wrote:
> On 6/2/17 7:42 AM, Edward Cree wrote:
> >Also, I feel I haven't fully understood the semantics of {min,max}_value and
> > signed vs. unsigned comparisons.  It seems that currently reg_set_min_max
> > [_inv] assumes that any given register-value will either only be used as
> > signed, or only be used as unsigned — which while potentially reasonable
> > for compiler-generated bytecode, could easily be untrue of a hand-crafted
> > BPF program.
> >For instance, take BPF_JGT(reg, val).  This currently sets
> > false_reg->min_value to zero, but if val >= (1<<63), the false branch could
> > be taken for a value that's negative (when interpreted as signed).
> 
> I think the way Josef intended it to behave is min/max_value are
> absolute values that 64-bits can hold.
> In that sense unsigned (JGT) comparison and the false branch are
> implying that min_value = 0.
> but if we don't treat min/max consistently as sign-free numbers
> than indeed it can cause issues.
> Do you have an asm test case that demonstrates that?
>

Well the min_value is a s64, but yeah anything negative is supposed to be
rejected, so it essentially acts as the range of unsigned absolute values it can
hold.  I tried to hand craft a way to exploit this but I don't think it's
possible.  In the normal BPF_JGT path with your case we'd end up with

false_reg->min_value = 0;
false_reg->max_value = 1<<63 = BPF_REGISTER_MAX_RANGE
true_reg->min_value = BPF_REGISTER_MIN_RANGE

>From here we want to exploit the fact that false_reg->min_value is not
necessarily correct, but in order to do that we need to get false_reg->max_value
below the actual size limit for the data we're reaching into, which means we
want to _only_ change false_reg->max_value.  Thankfully there doesn't appear to
be a way to do that, everything changes either only min_value or both min_value
and max_value.  I think we're safe here, unless I've missed something.  Thanks,

Josef

[PATCH net-next][v2] bpf: test for AND edge cases

2017-02-03 Thread Josef Bacik

These two tests are based on the work done for f23cc643f9ba.  The first test is
just a basic one to make sure we don't allow AND'ing negative values, even if it
would result in a valid index for the array.  The second is a cleaned up version
of the original testcase provided by Jann Horn that resulted in the commit.

Acked-by: Alexei Starovoitov <a...@kernel.org>
Acked-by: Daniel Borkmann <dan...@iogearbox.net>
Signed-off-by: Josef Bacik <jba...@fb.com>
---
v1->v2:
-rebased onto net-next

 tools/testing/selftests/bpf/test_verifier.c | 55 +
 1 file changed, 55 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index 0d0912c..df194e1 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -4370,6 +4370,61 @@ static struct bpf_test tests[] = {
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
+   {
+   "invalid and of negative number",
+   .insns = {
+   BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+   BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+   BPF_LD_MAP_FD(BPF_REG_1, 0),
+   BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+BPF_FUNC_map_lookup_elem),
+   BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+   BPF_MOV64_IMM(BPF_REG_1, 6),
+   BPF_ALU64_IMM(BPF_AND, BPF_REG_1, -4),
+   BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2),
+   BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+   BPF_ST_MEM(BPF_DW, BPF_REG_0, 0,
+  offsetof(struct test_val, foo)),
+   BPF_EXIT_INSN(),
+   },
+   .fixup_map2 = { 3 },
+   .errstr_unpriv = "R0 pointer arithmetic prohibited",
+   .errstr = "R0 min value is negative, either use unsigned index 
or do a if (index >=0) check.",
+   .result = REJECT,
+   .result_unpriv = REJECT,
+   },
+   {
+   "invalid range check",
+   .insns = {
+   BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+   BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+   BPF_LD_MAP_FD(BPF_REG_1, 0),
+   BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+BPF_FUNC_map_lookup_elem),
+   BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 12),
+   BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+   BPF_MOV64_IMM(BPF_REG_9, 1),
+   BPF_ALU32_IMM(BPF_MOD, BPF_REG_1, 2),
+   BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 1),
+   BPF_ALU32_REG(BPF_AND, BPF_REG_9, BPF_REG_1),
+   BPF_ALU32_IMM(BPF_ADD, BPF_REG_9, 1),
+   BPF_ALU32_IMM(BPF_RSH, BPF_REG_9, 1),
+   BPF_MOV32_IMM(BPF_REG_3, 1),
+   BPF_ALU32_REG(BPF_SUB, BPF_REG_3, BPF_REG_9),
+   BPF_ALU32_IMM(BPF_MUL, BPF_REG_3, 0x1000),
+   BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_3),
+   BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0),
+   BPF_MOV64_REG(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   },
+   .fixup_map2 = { 3 },
+   .errstr_unpriv = "R0 pointer arithmetic prohibited",
+   .errstr = "R0 min value is negative, either use unsigned index 
or do a if (index >=0) check.",
+   .result = REJECT,
+   .result_unpriv = REJECT,
+   }
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
2.7.4

Re: [PATCH net-next] bpf: test for AND edge cases

2017-02-03 Thread Josef Bacik

On Fri, 2017-02-03 at 16:03 -0500, David Miller wrote:
> From: Josef Bacik <jba...@fb.com>
> Date: Thu, 2 Feb 2017 12:00:38 -0500
> 
> > 
> > These two tests are based on the work done for f23cc643f9ba.  The
> > first test is
> > just a basic one to make sure we don't allow AND'ing negative
> > values, even if it
> > would result in a valid index for the array.  The second is a
> > cleaned up version
> > of the original testcase provided by Jann Horn that resulted in the
> > commit.
> > 
> > Signed-off-by: Josef Bacik <jba...@fb.com>
> This doesn't apply cleanly to net-next, please respin.

Ugh sorry did it on the wrong branch, I'll send an updated one shortly.
 Thanks,

Josef

Re: [PATCH net-next] loopback: clear pfmemalloc on outgoing skb's

2017-02-02 Thread Josef Bacik

On Thu, 2017-02-02 at 09:06 -0800, Eric Dumazet wrote:
> On Thu, 2017-02-02 at 10:56 -0500, Josef Bacik wrote:
> 
> > 
> > The problem is we set skb->pfmemalloc a bunch of different places,
> > such
> > as __skb_fill_page_desc, which appears to be used in both the RX
> > and TX
> > path, so we can't just kill it there.  Do we want to go through and
> > audit each one, provide a way for callers to indicate if we care
> > about
> > pfmemalloc and solve this problem that way?  I feel like that's
> > more
> > likely to bite us in the ass down the line, and somebody who
> > doesn't
> > know the context is going to come along and change it and regress
> > us to
> > the current situation.  The only place this is a problem is with
> > loopback, and my change is contained to this one weird
> > case.  Thanks,
> I mentioned this in another mail :
> 
> Same issue will happen with veth, or any kind of driver allowing skb
> being given back to the stack in RX.
> 
> So your patch on loopback is not the definitive patch.
> 
> We probably should clear pf->memalloc directly in TCP write function.
> 
> Note that I clear it on the clone, not in original skb.
> 
> (It might be very useful to keep skb->pfmemalloc on original skbs in
> write queue, at least for debugging purposes)
> 
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index
> 8ce50dc3ab8cac821b8a2c3e0d31f0aa42f5c9d5..010280f1592d3bd195315882c36
> 4bdbbd4a1c2ec 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -944,6 +944,7 @@ static int tcp_transmit_skb(struct sock *sk,
> struct sk_buff *skb, int clone_it,
> skb = skb_clone(skb, gfp_mask);
> if (unlikely(!skb))
>     return -ENOBUFS;
> +   skb->pfmemalloc = 0;
> }
>  
> inet = inet_sk(sk);
> 
> 

Yup this fixes my problem, you can add

Acked-by: Josef Bacik <jba...@fb.com>

when you send it.  Thanks,

Josef

[PATCH net-next] bpf: test for AND edge cases

2017-02-02 Thread Josef Bacik

These two tests are based on the work done for f23cc643f9ba.  The first test is
just a basic one to make sure we don't allow AND'ing negative values, even if it
would result in a valid index for the array.  The second is a cleaned up version
of the original testcase provided by Jann Horn that resulted in the commit.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 tools/testing/selftests/bpf/test_verifier.c | 55 +
 1 file changed, 55 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index 853d7e4..44404f1 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -2905,6 +2905,61 @@ static struct bpf_test tests[] = {
.result = REJECT,
.errstr = "invalid bpf_context access",
},
+   {
+   "invalid and of negative number",
+   .insns = {
+   BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+   BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+   BPF_LD_MAP_FD(BPF_REG_1, 0),
+   BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+BPF_FUNC_map_lookup_elem),
+   BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+   BPF_MOV64_IMM(BPF_REG_1, 6),
+   BPF_ALU64_IMM(BPF_AND, BPF_REG_1, -4),
+   BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2),
+   BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+   BPF_ST_MEM(BPF_DW, BPF_REG_0, 0,
+  offsetof(struct test_val, foo)),
+   BPF_EXIT_INSN(),
+   },
+   .fixup_map2 = { 3 },
+   .errstr_unpriv = "R0 pointer arithmetic prohibited",
+   .errstr = "R0 min value is negative, either use unsigned index 
or do a if (index >=0) check.",
+   .result = REJECT,
+   .result_unpriv = REJECT,
+   },
+   {
+   "invalid range check",
+   .insns = {
+   BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+   BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+   BPF_LD_MAP_FD(BPF_REG_1, 0),
+   BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+BPF_FUNC_map_lookup_elem),
+   BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 12),
+   BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+   BPF_MOV64_IMM(BPF_REG_9, 1),
+   BPF_ALU32_IMM(BPF_MOD, BPF_REG_1, 2),
+   BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 1),
+   BPF_ALU32_REG(BPF_AND, BPF_REG_9, BPF_REG_1),
+   BPF_ALU32_IMM(BPF_ADD, BPF_REG_9, 1),
+   BPF_ALU32_IMM(BPF_RSH, BPF_REG_9, 1),
+   BPF_MOV32_IMM(BPF_REG_3, 1),
+   BPF_ALU32_REG(BPF_SUB, BPF_REG_3, BPF_REG_9),
+   BPF_ALU32_IMM(BPF_MUL, BPF_REG_3, 0x1000),
+   BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_3),
+   BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0),
+   BPF_MOV64_REG(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   },
+   .fixup_map2 = { 3 },
+   .errstr_unpriv = "R0 pointer arithmetic prohibited",
+   .errstr = "R0 min value is negative, either use unsigned index 
or do a if (index >=0) check.",
+   .result = REJECT,
+   .result_unpriv = REJECT,
+   }
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
2.7.4

Re: [PATCH net-next] loopback: clear pfmemalloc on outgoing skb's

2017-02-02 Thread Josef Bacik

On Wed, 2017-02-01 at 15:38 -0800, Eric Dumazet wrote:
> On Wed, 2017-02-01 at 16:04 -0500, Josef Bacik wrote:
> > 
> > I was seeing random disconnects while testing NBD over
> > loopback.  This turned
> > out to be because NBD sets pfmemalloc on it's socket, however the
> > receiving side
> > is a user space application so does not have pfmemalloc set on its
> > socket.  This
> > means that sk_filter_trim_cap will simply drop this packet, under
> > the assumption
> > that the other side will simply retransmit.  Well we do retransmit,
> > and then the
> > packet is just dropped again for the same reason.  To keep this
> > from happening
> > simply clear skb->pfmemalloc on transmit so that we don't drop the
> > packet on the
> > receive side.
> > 
> > Signed-off-by: Josef Bacik <jba...@fb.com>
> > ---
> >  drivers/net/loopback.c | 7 +++
> >  1 file changed, 7 insertions(+)
> > 
> > diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
> > index 1e05b7c..13c9126 100644
> > --- a/drivers/net/loopback.c
> > +++ b/drivers/net/loopback.c
> > @@ -81,6 +81,13 @@ static netdev_tx_t loopback_xmit(struct sk_buff
> > *skb,
> >      */
> >     skb_dst_force(skb);
> >  
> > +   /* If our transmitter was a pfmemalloc socket we need to
> > clear
> > +    * pfmemalloc here, otherwise the receiving socket may not
> > be
> > +    * pfmemalloc, and if this is a tcp packet then it'll get
> > dropped and
> > +    * all traffic will halt.
> > +    */
> > +   skb->pfmemalloc = false;
> > +
> I am not sure this is a proper fix.
> 
> Presumably if the socket was able to store packets in its write
> queue,
> fact that it sends it to loopback or an Ethernet link should not
> matter.
> 
> Only in RX path the pfmemalloc thing is really important.
> 
> So I would rather not set skb->pfmemalloc for skbs allocated for the
> write queue, and more exactly the fast clone.
> 
> This would actually speed up the stack a bit.

The problem is we set skb->pfmemalloc a bunch of different places, such
as __skb_fill_page_desc, which appears to be used in both the RX and TX
path, so we can't just kill it there.  Do we want to go through and
audit each one, provide a way for callers to indicate if we care about
pfmemalloc and solve this problem that way?  I feel like that's more
likely to bite us in the ass down the line, and somebody who doesn't
know the context is going to come along and change it and regress us to
the current situation.  The only place this is a problem is with
loopback, and my change is contained to this one weird case.  Thanks,

Josef

Re: [PATCH net-next] net: add LINUX_MIB_PFMEMALLOCDROP counter

2017-02-02 Thread Josef Bacik

On Wed, 2017-02-01 at 20:47 -0800, Eric Dumazet wrote:
> From: Eric Dumazet <eduma...@google.com>
> 
> Debugging issues caused by pfmemalloc is often tedious.
> 
> Add a new SNMP counter to more easily diagnose these problems.
> 
> Signed-off-by: Eric Dumazet <eduma...@google.com>
> Cc: Josef Bacik <jba...@fb.com>

Acked-by: Josef Bacik <jba...@fb.com>

Thanks Eric,

Josef

[PATCH net-next] loopback: clear pfmemalloc on outgoing skb's

2017-02-01 Thread Josef Bacik

I was seeing random disconnects while testing NBD over loopback.  This turned
out to be because NBD sets pfmemalloc on it's socket, however the receiving side
is a user space application so does not have pfmemalloc set on its socket.  This
means that sk_filter_trim_cap will simply drop this packet, under the assumption
that the other side will simply retransmit.  Well we do retransmit, and then the
packet is just dropped again for the same reason.  To keep this from happening
simply clear skb->pfmemalloc on transmit so that we don't drop the packet on the
receive side.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 drivers/net/loopback.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 1e05b7c..13c9126 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -81,6 +81,13 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 */
skb_dst_force(skb);
 
+   /* If our transmitter was a pfmemalloc socket we need to clear
+* pfmemalloc here, otherwise the receiving socket may not be
+* pfmemalloc, and if this is a tcp packet then it'll get dropped and
+* all traffic will halt.
+*/
+   skb->pfmemalloc = false;
+
skb->protocol = eth_type_trans(skb, dev);
 
/* it's OK to use per_cpu_ptr() because BHs are off */
-- 
2.7.4

Re: TCP stops sending packets over loopback on 4.10-rc3?

2017-01-26 Thread Josef Bacik

On Wed, 2017-01-25 at 06:39 -0800, Eric Dumazet wrote:
> On Wed, 2017-01-25 at 09:26 -0500, Josef Bacik wrote:
> 
> > 
> > Nope ftrace isn't broken, I'm just dumb, the space is being
> > reclaimed 
> > by sk_wmem_free_skb().  So I guess I need to figure out why I stop 
> > getting ACK's from the other side of the loopback.  Thanks,
> ss -temoi dst 127.0.0.1
> 
> Might give you some hints, like packets being dropped.
> 
> ACK can be delayed if the reader is slow to consume bytes.
> 

Yup looks like I'm getting packet loss for some reason, but the
application is sitting there in recvmsg, so it's not hung and
definitely available for receiving new packets.

ESTAB  0  4124232  
  ::1:34044
   
::1:nbd   t
imer:(on,1min38sec,9) ino:20067 sk:8 <->
 skmem:(r0,rb6291456,t0,tb4194304,f1720,w4204872,o0,bl0) ts
sack cubic wscale:7,7 rto:102912 backoff:9 rtt:0.084/0.038 ato:40
mss:65464 cwnd:1 ssthresh:18 bytes_acked:71964077253
bytes_received:68804409996 segs_out:3882829 segs_in:4092731 send
6234.7Mbps lastsnd:4336 lastrcv:111289 lastack:111299 unacked:28
retrans:1/4277 lost:28 reordering:60 rcv_rtt:1.875 rcv_space:1315136

ESTAB  0  0
 ::1:nbd   
   
   ::1:34044 ti
mer:(keepalive,109min,0) ino:19396 sk:2 <->
 skmem:(r0,rb6291456,t0,tb2626560,f0,w0,o0,bl0) ts sack cubic
wscale:7,7 rto:201 rtt:0.279/0.16 ato:40 mss:65464 cwnd:16 ssthresh:9
bytes_acked:68804409996 bytes_received:71964077252 segs_out:4092730
segs_in:3882792 send 30033.7Mbps lastsnd:111286 lastrcv:111307
lastack:111286 retrans:0/3113 reordering:26 rcv_rtt:1 rcv_space:4782816

I traced tcp_enter_loss() and once things stop moving that starts
firing.  That's all I have so far, been busy with other things but I'm
devoting my full attention to this now.  Thanks,

Josef

Re: TCP stops sending packets over loopback on 4.10-rc3?

2017-01-25 Thread Josef Bacik


On Wed, Jan 25, 2017 at 9:14 AM, Josef Bacik <jba...@fb.com> wrote:
On Tue, Jan 24, 2017 at 9:07 AM, Eric Dumazet 
<eric.duma...@gmail.com> wrote:

On Tue, 2017-01-24 at 06:20 -0500, Josef Bacik wrote:

 Hello,

 I've been trying to test some NBD changes I had made recently and I
 started having packet timeouts.  I traced this down to tcp just
 stopping sending packets after a lot of writing.  All NBD does is 
call

 kernel_sendmsg() with a request struct and some pages when it does
 writes.  I did a bunch of tracing and I've narrowed it down to 
running

 out of sk_wmem_queued space.  In tcp_sendmsg() here

 new_segment:
 /* Allocate new segment. If the interface 
is SG,

  * allocate skb fitting to single page.
  */
 if (!sk_stream_memory_free(sk))
 goto wait_for_sndbuf;

 we hit this pretty regularly, and eventually just get stuck in
 sk_stream_wait_memory until the timeout ends and we error out
 everything.  Now sk_stream_memory_free checks the sk_wmem_queued 
and
 calls into the sk_prot->stream_memory_free(), so I broke this out 
like

 the following


 if (sk->sk_wmem_queued >= sk->sk_sndbuf) {
 trace_printk("sk_wmem_queued %d, sk_sndbuf %d\n",
 sk->sk_wmem_queued, sk->sk_sndbuf);
 goto wait_for_sndbuf;
  }
  if (sk->sk_prot->stream_memory_free &&
 !sk->sk_prot->stream_memory_free(sk)) {
 trace_printk("sk_stream_memory_free\n");
 goto wait_for_sndbuf;
  }

 And I got this in my tracing

kworker/u16:5-112   [001]   1375.637564: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [001]   1375.639657: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [003]   1375.641128: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [003]   1375.643441: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [001]   1375.807614: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [001]   1377.538744: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [001]   1377.543418: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
 kworker/2:4H-1535  [002]   1377.544685: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [000]   1379.378352: tcp_sendmsg:
 sk_wmem_queued 4205796, sk_sndbuf 4194304
kworker/u16:5-112   [003]   1380.985721: tcp_sendmsg:
 sk_wmem_queued 4212416, sk_sndbuf 4194304

 This is as far as I've gotten and I'll keep digging into it, but I 
was

 wondering if this looks familiar to anybody?  Also one thing I've
 noticed is sk_stream_wait_memory() will wait on sk_sleep(sk), but
 basically nothing wakes this up.  For example it seems the main 
way we

 reduce sk_wmem_queued is through sk_wmem_free_skb(), which doesn't
 appear to wake anything up in any of its callers, so anybody who 
does

 end up sleeping will basically never wake up.  That seems like it
 should be more broken than it is, so I'm curious to know how 
things are

 actually woken up in this case.  Thanks,



git grep -n SOCK_QUEUE_SHRUNK

-> tcp_check_space()


But tcp_check_space() doesn't actually reduce sk_wmem_queued from 
what I can see.  The only places that appear to reduce it are 
tcp_trim_head, which is only called in the retransmit path, and 
sk_wmem_free_skb, which seems to be right, but I added a 
trace_printk() in it to see if it was firing during my test and it 
never fires.  So we _appear_ to only ever be incrementing this 
counter, but never decrementing it.  I'm doing a bunch of tracing 
trying to figure out what is going on here but so far nothing is 
popping which is starting to make me think ftrace is broken.  Thanks,


Nope ftrace isn't broken, I'm just dumb, the space is being reclaimed 
by sk_wmem_free_skb().  So I guess I need to figure out why I stop 
getting ACK's from the other side of the loopback.  Thanks,


Josef

Re: TCP stops sending packets over loopback on 4.10-rc3?

2017-01-25 Thread Josef Bacik

On Tue, Jan 24, 2017 at 9:07 AM, Eric Dumazet <eric.duma...@gmail.com> 
wrote:

On Tue, 2017-01-24 at 06:20 -0500, Josef Bacik wrote:

 Hello,

 I've been trying to test some NBD changes I had made recently and I
 started having packet timeouts.  I traced this down to tcp just
 stopping sending packets after a lot of writing.  All NBD does is 
call

 kernel_sendmsg() with a request struct and some pages when it does
 writes.  I did a bunch of tracing and I've narrowed it down to 
running

 out of sk_wmem_queued space.  In tcp_sendmsg() here

 new_segment:
 /* Allocate new segment. If the interface 
is SG,

  * allocate skb fitting to single page.
  */
 if (!sk_stream_memory_free(sk))
 goto wait_for_sndbuf;

 we hit this pretty regularly, and eventually just get stuck in
 sk_stream_wait_memory until the timeout ends and we error out
 everything.  Now sk_stream_memory_free checks the sk_wmem_queued and
 calls into the sk_prot->stream_memory_free(), so I broke this out 
like

 the following


 if (sk->sk_wmem_queued >= sk->sk_sndbuf) {
 trace_printk("sk_wmem_queued %d, sk_sndbuf %d\n",
 sk->sk_wmem_queued, sk->sk_sndbuf);
 goto wait_for_sndbuf;
  }
  if (sk->sk_prot->stream_memory_free &&
 !sk->sk_prot->stream_memory_free(sk)) {
 trace_printk("sk_stream_memory_free\n");
 goto wait_for_sndbuf;
  }

 And I got this in my tracing

kworker/u16:5-112   [001]   1375.637564: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [001]   1375.639657: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [003]   1375.641128: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [003]   1375.643441: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [001]   1375.807614: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [001]   1377.538744: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [001]   1377.543418: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
 kworker/2:4H-1535  [002]   1377.544685: tcp_sendmsg:
 sk_wmem_queued 4204872, sk_sndbuf 4194304
kworker/u16:5-112   [000]   1379.378352: tcp_sendmsg:
 sk_wmem_queued 4205796, sk_sndbuf 4194304
kworker/u16:5-112   [003]   1380.985721: tcp_sendmsg:
 sk_wmem_queued 4212416, sk_sndbuf 4194304

 This is as far as I've gotten and I'll keep digging into it, but I 
was

 wondering if this looks familiar to anybody?  Also one thing I've
 noticed is sk_stream_wait_memory() will wait on sk_sleep(sk), but
 basically nothing wakes this up.  For example it seems the main way 
we

 reduce sk_wmem_queued is through sk_wmem_free_skb(), which doesn't
 appear to wake anything up in any of its callers, so anybody who 
does

 end up sleeping will basically never wake up.  That seems like it
 should be more broken than it is, so I'm curious to know how things 
are

 actually woken up in this case.  Thanks,



git grep -n SOCK_QUEUE_SHRUNK

-> tcp_check_space()


But tcp_check_space() doesn't actually reduce sk_wmem_queued from what 
I can see.  The only places that appear to reduce it are tcp_trim_head, 
which is only called in the retransmit path, and sk_wmem_free_skb, 
which seems to be right, but I added a trace_printk() in it to see if 
it was firing during my test and it never fires.  So we _appear_ to 
only ever be incrementing this counter, but never decrementing it.  I'm 
doing a bunch of tracing trying to figure out what is going on here but 
so far nothing is popping which is starting to make me think ftrace is 
broken.  Thanks,


Josef

TCP stops sending packets over loopback on 4.10-rc3?

2017-01-24 Thread Josef Bacik


Hello,

I've been trying to test some NBD changes I had made recently and I 
started having packet timeouts.  I traced this down to tcp just 
stopping sending packets after a lot of writing.  All NBD does is call 
kernel_sendmsg() with a request struct and some pages when it does 
writes.  I did a bunch of tracing and I've narrowed it down to running 
out of sk_wmem_queued space.  In tcp_sendmsg() here


new_segment:
   /* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
*/
   if (!sk_stream_memory_free(sk))
   goto wait_for_sndbuf;

we hit this pretty regularly, and eventually just get stuck in 
sk_stream_wait_memory until the timeout ends and we error out 
everything.  Now sk_stream_memory_free checks the sk_wmem_queued and 
calls into the sk_prot->stream_memory_free(), so I broke this out like 
the following



   if (sk->sk_wmem_queued >= sk->sk_sndbuf) {
   trace_printk("sk_wmem_queued %d, sk_sndbuf %d\n", 
sk->sk_wmem_queued, sk->sk_sndbuf);

   goto wait_for_sndbuf;
}
if (sk->sk_prot->stream_memory_free && 
!sk->sk_prot->stream_memory_free(sk)) {

   trace_printk("sk_stream_memory_free\n");
   goto wait_for_sndbuf;
}

And I got this in my tracing

  kworker/u16:5-112   [001]   1375.637564: tcp_sendmsg: 
sk_wmem_queued 4204872, sk_sndbuf 4194304
  kworker/u16:5-112   [001]   1375.639657: tcp_sendmsg: 
sk_wmem_queued 4204872, sk_sndbuf 4194304
  kworker/u16:5-112   [003]   1375.641128: tcp_sendmsg: 
sk_wmem_queued 4204872, sk_sndbuf 4194304
  kworker/u16:5-112   [003]   1375.643441: tcp_sendmsg: 
sk_wmem_queued 4204872, sk_sndbuf 4194304
  kworker/u16:5-112   [001]   1375.807614: tcp_sendmsg: 
sk_wmem_queued 4204872, sk_sndbuf 4194304
  kworker/u16:5-112   [001]   1377.538744: tcp_sendmsg: 
sk_wmem_queued 4204872, sk_sndbuf 4194304
  kworker/u16:5-112   [001]   1377.543418: tcp_sendmsg: 
sk_wmem_queued 4204872, sk_sndbuf 4194304
   kworker/2:4H-1535  [002]   1377.544685: tcp_sendmsg: 
sk_wmem_queued 4204872, sk_sndbuf 4194304
  kworker/u16:5-112   [000]   1379.378352: tcp_sendmsg: 
sk_wmem_queued 4205796, sk_sndbuf 4194304
  kworker/u16:5-112   [003]   1380.985721: tcp_sendmsg: 
sk_wmem_queued 4212416, sk_sndbuf 4194304


This is as far as I've gotten and I'll keep digging into it, but I was 
wondering if this looks familiar to anybody?  Also one thing I've 
noticed is sk_stream_wait_memory() will wait on sk_sleep(sk), but 
basically nothing wakes this up.  For example it seems the main way we 
reduce sk_wmem_queued is through sk_wmem_free_skb(), which doesn't 
appear to wake anything up in any of its callers, so anybody who does 
end up sleeping will basically never wake up.  That seems like it 
should be more broken than it is, so I'm curious to know how things are 
actually woken up in this case.  Thanks,


Josef

Re: [PATCH] inet: don't use sk_v6_rcv_saddr directly

2017-01-20 Thread Josef Bacik


On Thu, Jan 19, 2017 at 5:47 PM, Josef Bacik <jba...@fb.com> wrote:
When comparing two sockets we need to use inet6_rcv_saddr so we get a 
NULL
sk_v6_rcv_saddr if the socket isn't AF_INET6, otherwise our 
comparison function

can be wrong.

Fixes: 637bc8b ("inet: reset tb->fastreuseport when adding a 
reuseport sk")

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 net/ipv4/inet_connection_sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c 
b/net/ipv4/inet_connection_sock.c

index 096a085..a336c42 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -99,7 +99,7 @@ int inet_rcv_saddr_equal(const struct sock *sk, 
const struct sock *sk2,

 #if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
return ipv6_rcv_saddr_equal(>sk_v6_rcv_saddr,
-   >sk_v6_rcv_saddr,
+   inet6_rcv_saddr(sk2),
sk->sk_rcv_saddr,
sk2->sk_rcv_saddr,
ipv6_only_sock(sk),
--
2.5.5


Sorry I forgot to tag this, it's for net-next.  Thanks,

Josef

[PATCH] inet: don't use sk_v6_rcv_saddr directly

2017-01-19 Thread Josef Bacik

When comparing two sockets we need to use inet6_rcv_saddr so we get a NULL
sk_v6_rcv_saddr if the socket isn't AF_INET6, otherwise our comparison function
can be wrong.

Fixes: 637bc8b ("inet: reset tb->fastreuseport when adding a reuseport sk")
Signed-off-by: Josef Bacik <jba...@fb.com>
---
 net/ipv4/inet_connection_sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 096a085..a336c42 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -99,7 +99,7 @@ int inet_rcv_saddr_equal(const struct sock *sk, const struct 
sock *sk2,
 #if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
return ipv6_rcv_saddr_equal(>sk_v6_rcv_saddr,
-   >sk_v6_rcv_saddr,
+   inet6_rcv_saddr(sk2),
sk->sk_rcv_saddr,
sk2->sk_rcv_saddr,
ipv6_only_sock(sk),
-- 
2.5.5

[PATCH 4/6 net-next] inet: don't check for bind conflicts twice when searching for a port

2017-01-17 Thread Josef Bacik

This is just wasted time, we've already found a tb that doesn't have a bind
conflict, and we don't drop the head lock so scanning again isn't going to give
us a different answer.  Instead move the tb->reuse setting logic outside of the
found_tb path and put it in the success: path.  Then make it so that we don't
goto again if we find a bind conflict in the found_tb path as we won't reach
this anymore when we are scanning for an ephemeral port.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 net/ipv4/inet_connection_sock.c | 31 +++
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d352366..f7e844d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -164,7 +164,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
-   int ret = 1, attempts = 5, port = snum;
+   int ret = 1, port = snum;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
int i, low, high, attempt_half;
@@ -183,7 +183,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 
goto tb_not_found;
}
-again:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
 other_half_scan:
inet_get_local_port_range(net, , );
@@ -221,7 +220,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
inet_bind_bucket_for_each(tb, >chain)
if (net_eq(ib_net(tb), net) && tb->port == port) {
if (!inet_csk_bind_conflict(sk, tb, false, 
reuseport_ok))
-   goto tb_found;
+   goto success;
goto next_port;
}
goto tb_not_found;
@@ -256,23 +255,11 @@ int inet_csk_get_port(struct sock *sk, unsigned short 
snum)
  !rcu_access_pointer(sk->sk_reuseport_cb) &&
  sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
goto success;
-   if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) {
-   if ((reuse ||
-(tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(tb->fastuid, uid))) && !snum &&
-   --attempts >= 0) {
-   spin_unlock_bh(>lock);
-   goto again;
-   }
+   if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok))
goto fail_unlock;
-   }
-   if (!reuse)
-   tb->fastreuse = 0;
-   if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
-   tb->fastreuseport = 0;
-   } else {
+   }
+success:
+   if (!hlist_empty(>owners)) {
tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = 1;
@@ -280,8 +267,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
} else {
tb->fastreuseport = 0;
}
+   } else {
+   if (!reuse)
+   tb->fastreuse = 0;
+   if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+   tb->fastreuseport = 0;
}
-success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
-- 
2.9.3

[PATCH 6/6 net-next] inet: reset tb->fastreuseport when adding a reuseport sk

2017-01-17 Thread Josef Bacik

If we have non reuseport sockets on a tb we will set tb->fastreuseport to 0 and
never set it again.  Which means that in the future if we end up adding a bunch
of reuseport sk's to that tb we'll have to do the expensive scan every time.
Instead add the ipv4/ipv6 saddr fields to the bind bucket, as well as the family
so we know what comparison to make, and the ipv6 only setting so we can make
sure to compare with new sockets appropriately.  Once one sk has made it onto
the list we know that there are no potential bind conflicts on the owners list
that match that sk's rcv_addr.  So copy the sk's information into our bind
bucket and set tb->fastruseport to FASTREUSESOCK_STRICT so we know we have to do
an extra check for subsequent reuseport sockets and skip the expensive bind
conflict check.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 include/net/inet_hashtables.h   |   9 
 net/ipv4/inet_connection_sock.c | 106 
 2 files changed, 95 insertions(+), 20 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 3fc0366..1178931 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -74,12 +74,21 @@ struct inet_ehash_bucket {
  * users logged onto your box, isn't it nice to know that new data
  * ports are created in O(1) time?  I thought so. ;-)  -DaveM
  */
+#define FASTREUSEPORT_ANY  1
+#define FASTREUSEPORT_STRICT   2
+
 struct inet_bind_bucket {
possible_net_t  ib_net;
unsigned short  port;
signed char fastreuse;
signed char fastreuseport;
kuid_t  fastuid;
+#if IS_ENABLED(CONFIG_IPV6)
+   struct in6_addr fast_v6_rcv_saddr;
+#endif
+   __be32  fast_rcv_saddr;
+   unsigned short  fast_sk_family;
+   boolfast_ipv6_only;
struct hlist_node   node;
struct hlist_head   owners;
 };
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index bbe2892..096a085 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -38,20 +38,21 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
  *  IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
  *  and 0.0.0.0 equals to 0.0.0.0 only
  */
-static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
+   const struct in6_addr *sk2_rcv_saddr6,
+   __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+   bool sk1_ipv6only, bool sk2_ipv6only,
bool match_wildcard)
 {
-   const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
-   int sk2_ipv6only = inet_v6_ipv6only(sk2);
-   int addr_type = ipv6_addr_type(>sk_v6_rcv_saddr);
+   int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : 
IPV6_ADDR_MAPPED;
 
/* if both are mapped, treat as IPv4 */
if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
if (!sk2_ipv6only) {
-   if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+   if (sk1_rcv_saddr == sk2_rcv_saddr)
return 1;
-   if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
+   if (!sk1_rcv_saddr || !sk2_rcv_saddr)
return match_wildcard;
}
return 0;
@@ -65,11 +66,11 @@ static int ipv6_rcv_saddr_equal(const struct sock *sk, 
const struct sock *sk2,
return 1;
 
if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
-   !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
+   !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
return 1;
 
if (sk2_rcv_saddr6 &&
-   ipv6_addr_equal(>sk_v6_rcv_saddr, sk2_rcv_saddr6))
+   ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
return 1;
 
return 0;
@@ -80,13 +81,13 @@ static int ipv6_rcv_saddr_equal(const struct sock *sk, 
const struct sock *sk2,
  * match_wildcard == false: addresses must be exactly the same, i.e.
  *  0.0.0.0 only equals to 0.0.0.0
  */
-static int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
-   bool match_wildcard)
+static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+   bool sk2_ipv6only, bool match_wildcard)
 {
-   if (!ipv6_only_sock(sk2)) {
-   if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+   if (!sk2_ipv6only) {

[PATCH 3/6 net-next] inet: kill smallest_size and smallest_port

2017-01-17 Thread Josef Bacik

In inet_csk_get_port we seem to be using smallest_port to figure out where the
best place to look for a SO_REUSEPORT sk that matches with an existing set of
SO_REUSEPORT's.  However if we get to the logic

if (smallest_size != -1) {
port = smallest_port;
goto have_port;
}

we will do a useless search, because we would have already done the
inet_csk_bind_conflict for that port and it would have returned 1, otherwise we
would have gone to found_tb and succeeded.  Since this logic makes us do yet
another trip through inet_csk_bind_conflict for a port we know won't work just
delete this code and save us the time.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 include/net/inet_hashtables.h   |  1 -
 net/ipv4/inet_connection_sock.c | 26 --
 net/ipv4/inet_hashtables.c  |  3 ---
 3 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 756ed16..3fc0366 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -80,7 +80,6 @@ struct inet_bind_bucket {
signed char fastreuse;
signed char fastreuseport;
kuid_t  fastuid;
-   int num_owners;
struct hlist_node   node;
struct hlist_head   owners;
 };
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index a1c9055..d352366 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -165,7 +165,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
int ret = 1, attempts = 5, port = snum;
-   int smallest_size = -1, smallest_port;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
int i, low, high, attempt_half;
@@ -175,7 +174,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
bool reuseport_ok = !!snum;
 
if (port) {
-have_port:
head = >bhash[inet_bhashfn(net, port,
  hinfo->bhash_size)];
spin_lock_bh(>lock);
@@ -209,8 +207,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 * We do the opposite to not pollute connect() users.
 */
offset |= 1U;
-   smallest_size = -1;
-   smallest_port = low; /* avoid compiler warning */
 
 other_parity_scan:
port = low + offset;
@@ -224,15 +220,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
spin_lock_bh(>lock);
inet_bind_bucket_for_each(tb, >chain)
if (net_eq(ib_net(tb), net) && tb->port == port) {
-   if (((tb->fastreuse > 0 && reuse) ||
-(tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- !rcu_access_pointer(sk->sk_reuseport_cb) 
&&
- uid_eq(tb->fastuid, uid))) &&
-   (tb->num_owners < smallest_size || 
smallest_size == -1)) {
-   smallest_size = tb->num_owners;
-   smallest_port = port;
-   }
if (!inet_csk_bind_conflict(sk, tb, false, 
reuseport_ok))
goto tb_found;
goto next_port;
@@ -243,10 +230,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
cond_resched();
}
 
-   if (smallest_size != -1) {
-   port = smallest_port;
-   goto have_port;
-   }
offset--;
if (!(offset & 1))
goto other_parity_scan;
@@ -268,19 +251,18 @@ int inet_csk_get_port(struct sock *sk, unsigned short 
snum)
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
 
-   if (((tb->fastreuse > 0 && reuse) ||
+   if ((tb->fastreuse > 0 && reuse) ||
 (tb->fastreuseport > 0 &&
  !rcu_access_pointer(sk->sk_reuseport_cb) &&
- sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-   smallest_size == -1)
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
goto success;
if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) {
if ((reuse ||
 (tb->fastreuseport > 0 &&

[PATCH 5/6 net-next] inet: split inet_csk_get_port into two functions

2017-01-17 Thread Josef Bacik

inet_csk_get_port does two different things, it either scans for an open port,
or it tries to see if the specified port is available for use.  Since these two
operations have different rules and are basically independent lets split them
into two different functions to make them both more readable.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 net/ipv4/inet_connection_sock.c | 66 +++--
 1 file changed, 44 insertions(+), 22 deletions(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f7e844d..bbe2892 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -156,33 +156,21 @@ static int inet_csk_bind_conflict(const struct sock *sk,
return sk2 != NULL;
 }
 
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- * We try to allocate an odd port (and leave even ports for connect())
+/*
+ * Find an open port number for the socket.  Returns with the
+ * inet_bind_hashbucket lock held.
  */
-int inet_csk_get_port(struct sock *sk, unsigned short snum)
+static struct inet_bind_hashbucket *
+inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int 
*port_ret)
 {
-   bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
-   int ret = 1, port = snum;
+   int port = 0;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
-   kuid_t uid = sock_i_uid(sk);
u32 remaining, offset;
-   bool reuseport_ok = !!snum;
 
-   if (port) {
-   head = >bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
-   spin_lock_bh(>lock);
-   inet_bind_bucket_for_each(tb, >chain)
-   if (net_eq(ib_net(tb), net) && tb->port == port)
-   goto tb_found;
-
-   goto tb_not_found;
-   }
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
 other_half_scan:
inet_get_local_port_range(net, , );
@@ -219,11 +207,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short 
snum)
spin_lock_bh(>lock);
inet_bind_bucket_for_each(tb, >chain)
if (net_eq(ib_net(tb), net) && tb->port == port) {
-   if (!inet_csk_bind_conflict(sk, tb, false, 
reuseport_ok))
+   if (!inet_csk_bind_conflict(sk, tb, false, 
false))
goto success;
goto next_port;
}
-   goto tb_not_found;
+   tb = NULL;
+   goto success;
 next_port:
spin_unlock_bh(>lock);
cond_resched();
@@ -238,8 +227,41 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
attempt_half = 2;
goto other_half_scan;
}
-   return ret;
+   return NULL;
+success:
+   *port_ret = port;
+   *tb_ret = tb;
+   return head;
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+   bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+   struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+   int ret = 1, port = snum;
+   struct inet_bind_hashbucket *head;
+   struct net *net = sock_net(sk);
+   struct inet_bind_bucket *tb = NULL;
+   kuid_t uid = sock_i_uid(sk);
 
+   if (!port) {
+   head = inet_csk_find_open_port(sk, , );
+   if (!head)
+   return ret;
+   if (!tb)
+   goto tb_not_found;
+   goto success;
+   }
+   head = >bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+   spin_lock_bh(>lock);
+   inet_bind_bucket_for_each(tb, >chain)
+   if (net_eq(ib_net(tb), net) && tb->port == port)
+   goto tb_found;
 tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
 net, head, port);
@@ -255,7 +277,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
  !rcu_access_pointer(sk->sk_reuseport_cb) &&
  sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
goto success;
-   if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok))
+   if (inet_csk_bind_conflict(sk, tb, true, true))
goto fail_unlock;
}
 success:
-- 
2.9.3

[PATCH 2/6 net-next] inet: drop ->bind_conflict

2017-01-17 Thread Josef Bacik

The only difference between inet6_csk_bind_conflict and inet_csk_bind_conflict
is how they check the rcv_saddr, so delete this call back and simply
change inet_csk_bind_conflict to call inet_rcv_saddr_equal.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 include/net/inet6_connection_sock.h |  5 -
 include/net/inet_connection_sock.h  |  6 --
 net/dccp/ipv4.c |  1 -
 net/dccp/ipv6.c |  2 --
 net/ipv4/inet_connection_sock.c | 22 +++-
 net/ipv4/tcp_ipv4.c |  1 -
 net/ipv6/inet6_connection_sock.c| 40 -
 net/ipv6/tcp_ipv6.c |  2 --
 8 files changed, 7 insertions(+), 72 deletions(-)

diff --git a/include/net/inet6_connection_sock.h 
b/include/net/inet6_connection_sock.h
index 3212b39..8ec87b6 100644
--- a/include/net/inet6_connection_sock.h
+++ b/include/net/inet6_connection_sock.h
@@ -15,16 +15,11 @@
 
 #include 
 
-struct inet_bind_bucket;
 struct request_sock;
 struct sk_buff;
 struct sock;
 struct sockaddr;
 
-int inet6_csk_bind_conflict(const struct sock *sk,
-   const struct inet_bind_bucket *tb, bool relax,
-   bool soreuseport_ok);
-
 struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 
*fl6,
  const struct request_sock *req, u8 proto);
 
diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index 84b2edd..826f198 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -62,9 +62,6 @@ struct inet_connection_sock_af_ops {
char __user *optval, int __user *optlen);
 #endif
void(*addr2sockaddr)(struct sock *sk, struct sockaddr *);
-   int (*bind_conflict)(const struct sock *sk,
-const struct inet_bind_bucket *tb,
-bool relax, bool soreuseport_ok);
void(*mtu_reduced)(struct sock *sk);
 };
 
@@ -263,9 +260,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock 
*icsk,
 
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
 
-int inet_csk_bind_conflict(const struct sock *sk,
-  const struct inet_bind_bucket *tb, bool relax,
-  bool soreuseport_ok);
 int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
 struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d859a5c..b043ec8 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -904,7 +904,6 @@ static const struct inet_connection_sock_af_ops 
dccp_ipv4_af_ops = {
.getsockopt= ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len  = sizeof(struct sockaddr_in),
-   .bind_conflict = inet_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index adfc790..08bcdc3 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -937,7 +937,6 @@ static const struct inet_connection_sock_af_ops 
dccp_ipv6_af_ops = {
.getsockopt= ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len  = sizeof(struct sockaddr_in6),
-   .bind_conflict = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
@@ -958,7 +957,6 @@ static const struct inet_connection_sock_af_ops 
dccp_ipv6_mapped = {
.getsockopt= ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len  = sizeof(struct sockaddr_in6),
-   .bind_conflict = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ba597cb..a1c9055 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -116,9 +116,9 @@ void inet_get_local_port_range(struct net *net, int *low, 
int *high)
 }
 EXPORT_SYMBOL(inet_get_local_port_range);
 
-int inet_csk_bind_conflict(const struct sock *sk,
-  const struct inet_bind_bucket *tb, bool relax,
-  bool reuseport_ok)
+static int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb,
+ bool relax, bool reuseport_ok)
 {
struct sock *sk2;
bool reuse = sk->sk_reuse;
@@ -134,7 +134,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
 
sk_for_each_bound(sk2, >owners) {
i

[PATCH 0/6 net-next][V4] Rework inet_csk_get_port

2017-01-17 Thread Josef Bacik

V3->V4:
-Removed the random include of addrconf.h that is no longer needed.

V2->V3:
-Dropped the fastsock from the tb and instead just carry the saddrs, family, and
 ipv6 only flag.
-Reworked the helper functions to deal with this change so I could still use
 them when checking the fast path.
-Killed tb->num_owners as per Eric's request.
-Attached a reproducer to the bottom of this email.

V1->V2:
-Added a new patch 'inet: collapse ipv4/v6 rcv_saddr_equal functions into one'
 at Hannes' suggestion.
-Dropped ->bind_conflict and just use the new helper.
-Fixed a compile bug from the original ->bind_conflict patch.

The original description of the series follows

=

At some point recently the guys working on our load balancer added the ability
to use SO_REUSEPORT.  When they restarted their app with this option enabled
they immediately hit a softlockup on what appeared to be the
inet_bind_bucket->lock.  Eventually what all of our debugging and discussion led
us to was the fact that the application comes up without SO_REUSEPORT, shuts
down which creates around 100k twsk's, and then comes up and tries to open a
bunch of sockets using SO_REUSEPORT, which meant traversing the inet_bind_bucket
owners list under the lock.  Since this lock is needed for dealing with the
twsk's and basically anything else related to connections we would softlockup,
and sometimes not ever recover.

To solve this problem I did what you see in Path 5/5.  Once we have a
SO_REUSEPORT socket on the tb->owners list we know that the socket has no
conflicts with any of the other sockets on that list.  So we can add a copy of
the sock_common (really all we need is the recv_saddr but it seemed ugly to copy
just the ipv6, ipv4, and flag to indicate if we were ipv6 only in there so I've
copied the whole common) in order to check subsequent SO_REUSEPORT sockets.  If
they match the previous one then we can skip the expensive
inet_csk_bind_conflict check.  This is what eliminated the soft lockup that we
were seeing.

Patches 1-4 are cleanups and re-workings.  For instance when we specify port ==
0 we need to find an open port, but we would do two passes through
inet_csk_bind_conflict every time we found a possible port.  We would also keep
track of the smallest_port value in order to try and use it if we found no
port our first run through.  This however made no sense as it would have had to
fail the first pass through inet_csk_bind_conflict, so would not actually pass
the second pass through either.  Finally I split the function into two functions
in order to make it easier to read and to distinguish between the two behaviors.

I have tested this on one of our load balancing boxes during peak traffic and it
hasn't fallen over.  But this is not my area, so obviously feel free to point
out where I'm being stupid and I'll get it fixed up and retested.  Thanks,

Josef

[PATCH 1/6 net-next] inet: collapse ipv4/v6 rcv_saddr_equal functions into one

2017-01-17 Thread Josef Bacik

We pass these per-protocol equal functions around in various places, but
we can just have one function that checks the sk->sk_family and then do
the right comparison function.  I've also changed the ipv4 version to
not cast to inet_sock since it is unneeded.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 include/net/addrconf.h   |  4 +--
 include/net/inet_hashtables.h|  5 +--
 include/net/udp.h|  1 -
 net/ipv4/inet_connection_sock.c  | 72 
 net/ipv4/inet_hashtables.c   | 16 +++--
 net/ipv4/udp.c   | 58 +++-
 net/ipv6/inet6_connection_sock.c |  4 +--
 net/ipv6/inet6_hashtables.c  | 46 +
 net/ipv6/udp.c   |  2 +-
 9 files changed, 95 insertions(+), 113 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 8f998af..17c6fd8 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -88,9 +88,7 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr 
*addr,
  u32 banned_flags);
 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
u32 banned_flags);
-int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
-bool match_wildcard);
-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 bool match_wildcard);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr 
*addr);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 0574493..756ed16 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -203,10 +203,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h);
 
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
-int __inet_hash(struct sock *sk, struct sock *osk,
-   int (*saddr_same)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard));
+int __inet_hash(struct sock *sk, struct sock *osk);
 int inet_hash(struct sock *sk);
 void inet_unhash(struct sock *sk);
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 1661791..c9d8b8e 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -204,7 +204,6 @@ static inline void udp_lib_close(struct sock *sk, long 
timeout)
 }
 
 int udp_lib_get_port(struct sock *sk, unsigned short snum,
-int (*)(const struct sock *, const struct sock *, bool),
 unsigned int hash2_nulladdr);
 
 u32 udp_flow_hashrnd(void);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 19ea045..ba597cb 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -31,6 +31,78 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown 
timer value\n";
 EXPORT_SYMBOL(inet_csk_timer_bug_msg);
 #endif
 
+#if IS_ENABLED(CONFIG_IPV6)
+/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
+ *  only, and any IPv4 addresses if not IPv6 only
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ *  IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ *  and 0.0.0.0 equals to 0.0.0.0 only
+ */
+static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+   bool match_wildcard)
+{
+   const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
+   int sk2_ipv6only = inet_v6_ipv6only(sk2);
+   int addr_type = ipv6_addr_type(>sk_v6_rcv_saddr);
+   int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : 
IPV6_ADDR_MAPPED;
+
+   /* if both are mapped, treat as IPv4 */
+   if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+   if (!sk2_ipv6only) {
+   if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+   return 1;
+   if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
+   return match_wildcard;
+   }
+   return 0;
+   }
+
+   if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+   return 1;
+
+   if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+   !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+   return 1;
+
+   if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+   !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
+   return 1;
+
+   if (sk2_rcv_saddr6 &&
+

Re: [PATCH 2/6 net-next] inet: drop ->bind_conflict

2017-01-12 Thread Josef Bacik

On Thu, Jan 12, 2017 at 2:56 PM, David Miller <da...@davemloft.net> 
wrote:

From: Josef Bacik <jba...@fb.com>
Date: Wed, 11 Jan 2017 15:22:40 -0500

 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
 index 56d756e..dc07734 100644
 --- a/net/ipv4/tcp_ipv4.c
 +++ b/net/ipv4/tcp_ipv4.c
 @@ -63,6 +63,7 @@
  #include 
  #include 

 +#include 
  #include 
  #include 
  #include 

I don't see what this has to do with this change.

Ugh sorry, that's a left over from when I had the protocol specific 
callback for the saddr_equal stuff, I'll fix that up.  Thanks,

Josef

Re: [PATCH 1/6 net-next] inet: collapse ipv4/v6 rcv_saddr_equal functions into one

2017-01-12 Thread Josef Bacik

On Thu, Jan 12, 2017 at 12:41 PM, Craig Gallek <kraigatg...@gmail.com> 
wrote:

On Wed, Jan 11, 2017 at 3:19 PM, Josef Bacik <jba...@fb.com> wrote:
 +int inet_rcv_saddr_equal(const struct sock *sk, const struct sock 
*sk2,

 +bool match_wildcard)
 +{
 +#if IS_ENABLED(CONFIG_IPV6)
 +   if (sk->sk_family == AF_INET6)

Still wrapping my head around this, so take it with a grain of salt,
but it's not obvious to me that sk and sk2 are guaranteed to have the
same family here (or if it even matters).  Especially in the context
of the next patch which removes the bind_conflict callback...  Does
this need to be an OR check for the family of either socket?  Or is it
safe as-is because the first socket passed to this function is always
the existing one and the second one is the possible conflict socket?


It's safe as is, all we care is that sk1 is the INET6 socket.  In the 
compare function we use inet6_rcv_saddr(sk2) which will return NULL if 
it isn't INET6 and then the function handles that case appropriately.  
This stuff is subtle so it's easy to get confused, I always made sure 
to run it on our production boxes to make sure I didn't break something 
;).  Thanks,


Josef

[PATCH 5/6 net-next] inet: split inet_csk_get_port into two functions

2017-01-11 Thread Josef Bacik

inet_csk_get_port does two different things, it either scans for an open port,
or it tries to see if the specified port is available for use.  Since these two
operations have different rules and are basically independent lets split them
into two different functions to make them both more readable.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 net/ipv4/inet_connection_sock.c | 66 +++--
 1 file changed, 44 insertions(+), 22 deletions(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f7e844d..bbe2892 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -156,33 +156,21 @@ static int inet_csk_bind_conflict(const struct sock *sk,
return sk2 != NULL;
 }
 
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- * We try to allocate an odd port (and leave even ports for connect())
+/*
+ * Find an open port number for the socket.  Returns with the
+ * inet_bind_hashbucket lock held.
  */
-int inet_csk_get_port(struct sock *sk, unsigned short snum)
+static struct inet_bind_hashbucket *
+inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int 
*port_ret)
 {
-   bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
-   int ret = 1, port = snum;
+   int port = 0;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
-   kuid_t uid = sock_i_uid(sk);
u32 remaining, offset;
-   bool reuseport_ok = !!snum;
 
-   if (port) {
-   head = >bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
-   spin_lock_bh(>lock);
-   inet_bind_bucket_for_each(tb, >chain)
-   if (net_eq(ib_net(tb), net) && tb->port == port)
-   goto tb_found;
-
-   goto tb_not_found;
-   }
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
 other_half_scan:
inet_get_local_port_range(net, , );
@@ -219,11 +207,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short 
snum)
spin_lock_bh(>lock);
inet_bind_bucket_for_each(tb, >chain)
if (net_eq(ib_net(tb), net) && tb->port == port) {
-   if (!inet_csk_bind_conflict(sk, tb, false, 
reuseport_ok))
+   if (!inet_csk_bind_conflict(sk, tb, false, 
false))
goto success;
goto next_port;
}
-   goto tb_not_found;
+   tb = NULL;
+   goto success;
 next_port:
spin_unlock_bh(>lock);
cond_resched();
@@ -238,8 +227,41 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
attempt_half = 2;
goto other_half_scan;
}
-   return ret;
+   return NULL;
+success:
+   *port_ret = port;
+   *tb_ret = tb;
+   return head;
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+   bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+   struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+   int ret = 1, port = snum;
+   struct inet_bind_hashbucket *head;
+   struct net *net = sock_net(sk);
+   struct inet_bind_bucket *tb = NULL;
+   kuid_t uid = sock_i_uid(sk);
 
+   if (!port) {
+   head = inet_csk_find_open_port(sk, , );
+   if (!head)
+   return ret;
+   if (!tb)
+   goto tb_not_found;
+   goto success;
+   }
+   head = >bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+   spin_lock_bh(>lock);
+   inet_bind_bucket_for_each(tb, >chain)
+   if (net_eq(ib_net(tb), net) && tb->port == port)
+   goto tb_found;
 tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
 net, head, port);
@@ -255,7 +277,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
  !rcu_access_pointer(sk->sk_reuseport_cb) &&
  sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
goto success;
-   if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok))
+   if (inet_csk_bind_conflict(sk, tb, true, true))
goto fail_unlock;
}
 success:
-- 
2.5.5

[PATCH 3/6 net-next] inet: kill smallest_size and smallest_port

2017-01-11 Thread Josef Bacik

In inet_csk_get_port we seem to be using smallest_port to figure out where the
best place to look for a SO_REUSEPORT sk that matches with an existing set of
SO_REUSEPORT's.  However if we get to the logic

if (smallest_size != -1) {
port = smallest_port;
goto have_port;
}

we will do a useless search, because we would have already done the
inet_csk_bind_conflict for that port and it would have returned 1, otherwise we
would have gone to found_tb and succeeded.  Since this logic makes us do yet
another trip through inet_csk_bind_conflict for a port we know won't work just
delete this code and save us the time.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 include/net/inet_hashtables.h   |  1 -
 net/ipv4/inet_connection_sock.c | 26 --
 net/ipv4/inet_hashtables.c  |  3 ---
 3 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 756ed16..3fc0366 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -80,7 +80,6 @@ struct inet_bind_bucket {
signed char fastreuse;
signed char fastreuseport;
kuid_t  fastuid;
-   int num_owners;
struct hlist_node   node;
struct hlist_head   owners;
 };
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index a1c9055..d352366 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -165,7 +165,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
int ret = 1, attempts = 5, port = snum;
-   int smallest_size = -1, smallest_port;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
int i, low, high, attempt_half;
@@ -175,7 +174,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
bool reuseport_ok = !!snum;
 
if (port) {
-have_port:
head = >bhash[inet_bhashfn(net, port,
  hinfo->bhash_size)];
spin_lock_bh(>lock);
@@ -209,8 +207,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 * We do the opposite to not pollute connect() users.
 */
offset |= 1U;
-   smallest_size = -1;
-   smallest_port = low; /* avoid compiler warning */
 
 other_parity_scan:
port = low + offset;
@@ -224,15 +220,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
spin_lock_bh(>lock);
inet_bind_bucket_for_each(tb, >chain)
if (net_eq(ib_net(tb), net) && tb->port == port) {
-   if (((tb->fastreuse > 0 && reuse) ||
-(tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- !rcu_access_pointer(sk->sk_reuseport_cb) 
&&
- uid_eq(tb->fastuid, uid))) &&
-   (tb->num_owners < smallest_size || 
smallest_size == -1)) {
-   smallest_size = tb->num_owners;
-   smallest_port = port;
-   }
if (!inet_csk_bind_conflict(sk, tb, false, 
reuseport_ok))
goto tb_found;
goto next_port;
@@ -243,10 +230,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
cond_resched();
}
 
-   if (smallest_size != -1) {
-   port = smallest_port;
-   goto have_port;
-   }
offset--;
if (!(offset & 1))
goto other_parity_scan;
@@ -268,19 +251,18 @@ int inet_csk_get_port(struct sock *sk, unsigned short 
snum)
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
 
-   if (((tb->fastreuse > 0 && reuse) ||
+   if ((tb->fastreuse > 0 && reuse) ||
 (tb->fastreuseport > 0 &&
  !rcu_access_pointer(sk->sk_reuseport_cb) &&
- sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-   smallest_size == -1)
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
goto success;
if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) {
if ((reuse ||
 (tb->fastreuseport > 0 &&

[PATCH 2/6 net-next] inet: drop ->bind_conflict

2017-01-11 Thread Josef Bacik

The only difference between inet6_csk_bind_conflict and inet_csk_bind_conflict
is how they check the rcv_saddr, so delete this call back and simply
change inet_csk_bind_conflict to call inet_rcv_saddr_equal.

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 include/net/inet6_connection_sock.h |  5 -
 include/net/inet_connection_sock.h  |  6 --
 net/dccp/ipv4.c |  2 +-
 net/dccp/ipv6.c |  2 --
 net/ipv4/inet_connection_sock.c | 22 +++-
 net/ipv4/tcp_ipv4.c |  2 +-
 net/ipv6/inet6_connection_sock.c| 40 -
 net/ipv6/tcp_ipv6.c |  2 --
 8 files changed, 9 insertions(+), 72 deletions(-)

diff --git a/include/net/inet6_connection_sock.h 
b/include/net/inet6_connection_sock.h
index 3212b39..8ec87b6 100644
--- a/include/net/inet6_connection_sock.h
+++ b/include/net/inet6_connection_sock.h
@@ -15,16 +15,11 @@
 
 #include 
 
-struct inet_bind_bucket;
 struct request_sock;
 struct sk_buff;
 struct sock;
 struct sockaddr;
 
-int inet6_csk_bind_conflict(const struct sock *sk,
-   const struct inet_bind_bucket *tb, bool relax,
-   bool soreuseport_ok);
-
 struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 
*fl6,
  const struct request_sock *req, u8 proto);
 
diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index 85ee387..add75c7 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -62,9 +62,6 @@ struct inet_connection_sock_af_ops {
char __user *optval, int __user *optlen);
 #endif
void(*addr2sockaddr)(struct sock *sk, struct sockaddr *);
-   int (*bind_conflict)(const struct sock *sk,
-const struct inet_bind_bucket *tb,
-bool relax, bool soreuseport_ok);
void(*mtu_reduced)(struct sock *sk);
 };
 
@@ -261,9 +258,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock 
*icsk,
 
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
 
-int inet_csk_bind_conflict(const struct sock *sk,
-  const struct inet_bind_bucket *tb, bool relax,
-  bool soreuseport_ok);
 int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
 struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d859a5c..ed6f99b 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -904,7 +905,6 @@ static const struct inet_connection_sock_af_ops 
dccp_ipv4_af_ops = {
.getsockopt= ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len  = sizeof(struct sockaddr_in),
-   .bind_conflict = inet_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index adfc790..08bcdc3 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -937,7 +937,6 @@ static const struct inet_connection_sock_af_ops 
dccp_ipv6_af_ops = {
.getsockopt= ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len  = sizeof(struct sockaddr_in6),
-   .bind_conflict = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
@@ -958,7 +957,6 @@ static const struct inet_connection_sock_af_ops 
dccp_ipv6_mapped = {
.getsockopt= ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len  = sizeof(struct sockaddr_in6),
-   .bind_conflict = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ba597cb..a1c9055 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -116,9 +116,9 @@ void inet_get_local_port_range(struct net *net, int *low, 
int *high)
 }
 EXPORT_SYMBOL(inet_get_local_port_range);
 
-int inet_csk_bind_conflict(const struct sock *sk,
-  const struct inet_bind_bucket *tb, bool relax,
-  bool reuseport_ok)
+static int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb,
+ bool relax, bool reuseport_ok)
 {
struct sock *sk2;
bool reuse = sk->sk_reuse;
@@ -134,7 +134,6 @@ int inet_csk_bind_conflict(const struct

1 2 >

1 - 100 of 152 matches

Mail list logo