[PATCH bpf-next 1/7] bpf: Add bpf_line_info support

2018-12-07 Thread Martin KaFai Lau
This patch adds bpf_line_info support.

It accepts an array of bpf_line_info objects during BPF_PROG_LOAD.
The "line_info", "line_info_cnt" and "line_info_rec_size" are added
to the "union bpf_attr".  The "line_info_rec_size" makes
bpf_line_info extensible in the future.

The new "check_btf_line()" ensures the userspace line_info is valid
for the kernel to use.

When the verifier is translating/patching the bpf_prog (through
"bpf_patch_insn_single()"), the line_infos' insn_off is also
adjusted by the newly added "bpf_adj_linfo()".

If the bpf_prog is jited, this patch also provides the jited addrs (in
aux->jited_linfo) for the corresponding line_info.insn_off.
"bpf_prog_fill_jited_linfo()" is added to fill the aux->jited_linfo.
It is currently called by the x86 jit.  Other jits can also use
"bpf_prog_fill_jited_linfo()"; that will be done in followup patches.
In the future, if deemed necessary, a particular jit could also provide
its own "bpf_prog_fill_jited_linfo()" implementation.

A few "*line_info*" fields are added to the bpf_prog_info such
that the user can get the xlated line_info back (i.e. the line_info
with its insn_off reflecting the translated prog).  The jited_line_info
is available if the prog is jited.  It is an array of __u64.
If the prog is not jited, jited_line_info_cnt is 0.

The verifier's verbose log with line_info will be done in
a follow-up patch.
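
For illustration (not part of the patch), a minimal userspace sketch of
passing line_info at BPF_PROG_LOAD time.  The wrapper below is
hypothetical; insns, btf_fd and the linfo array are assumed to be
prepared elsewhere, and error handling is omitted:

  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/bpf.h>

  static int load_with_linfo(const struct bpf_insn *insns, __u32 insn_cnt,
                             int btf_fd, const struct bpf_line_info *linfo,
                             __u32 linfo_cnt)
  {
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.prog_type = BPF_PROG_TYPE_TRACEPOINT;
          attr.insns = (__u64)(unsigned long)insns;
          attr.insn_cnt = insn_cnt;
          attr.license = (__u64)(unsigned long)"GPL";
          attr.prog_btf_fd = btf_fd;
          /* rec_size is how bpf_line_info stays extensible */
          attr.line_info_rec_size = sizeof(struct bpf_line_info);
          attr.line_info = (__u64)(unsigned long)linfo;
          attr.line_info_cnt = linfo_cnt;

          return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
  }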

Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 arch/x86/net/bpf_jit_comp.c  |   2 +
 include/linux/bpf.h  |  21 
 include/linux/bpf_verifier.h |   1 +
 include/linux/btf.h  |   1 +
 include/linux/filter.h   |   7 ++
 include/uapi/linux/bpf.h |  19 
 kernel/bpf/btf.c |   2 +-
 kernel/bpf/core.c| 118 -
 kernel/bpf/syscall.c |  83 +--
 kernel/bpf/verifier.c| 198 ++-
 10 files changed, 419 insertions(+), 33 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2580cd2e98b1..5542303c43d9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1181,6 +1181,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
 
if (!image || !prog->is_func || extra_pass) {
+   if (image)
+   bpf_prog_fill_jited_linfo(prog, addrs);
 out_addrs:
kfree(addrs);
kfree(jit_data);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e82b7039fc66..0c992b86eb2c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -319,7 +319,28 @@ struct bpf_prog_aux {
struct bpf_prog_offload *offload;
struct btf *btf;
struct bpf_func_info *func_info;
+   /* bpf_line_info loaded from userspace.  linfo->insn_off
+* has the xlated insn offset.
+* Both the main and sub prog share the same linfo.
+* The subprog can access its first linfo by
+* using the linfo_idx.
+*/
+   struct bpf_line_info *linfo;
+   /* jited_linfo is the jited addr of the linfo.  It has a
+* one to one mapping to linfo:
+* jited_linfo[i] is the jited addr for the linfo[i]->insn_off.
+* Both the main and sub prog share the same jited_linfo.
+* The subprog can access its first jited_linfo by
+* using the linfo_idx.
+*/
+   void **jited_linfo;
u32 func_info_cnt;
+   u32 nr_linfo;
+   /* subprog can use linfo_idx to access its first linfo and
+* jited_linfo.
+* main prog always has linfo_idx == 0
+*/
+   u32 linfo_idx;
union {
struct work_struct work;
struct rcu_head rcu;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 11f5df1092d9..c736945be7c5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -203,6 +203,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 
 struct bpf_subprog_info {
u32 start; /* insn idx of function entry point */
+   u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
u16 stack_depth; /* max. stack depth used by this function */
 };
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 8c2199b5d250..b98405a56383 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -46,6 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
   struct seq_file *m);
 int btf_get_fd_by_id(u32 id);
 u32 btf_id(const struct btf *btf);
+bool btf_name_offset_valid(const struct btf *btf, u32 offset);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d16dee

[PATCH bpf-next 6/7] bpf: libbpf: Add btf_line_info support to libbpf

2018-12-07 Thread Martin KaFai Lau
This patch adds bpf_line_info support to libbpf:
1) Parsing the line_info sec from ".BTF.ext"
2) Relocating the line_info.  If the main prog *_info relocation
   fails, it will ignore the remaining subprog line_info and continue.
   If the subprog *_info relocation fails, it will bail out.
3) BPF_PROG_LOAD a prog with line_info
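
When the running kernel's bpf_line_info is smaller than the record
libbpf was built with, the load fails with E2BIG and libbpf retries
with the tail of each record zeroed out.  A sketch of the line_info
retry branch, assumed to mirror the func_info branch in the diff below:

	linfo = alloc_zero_tailing_info(load_attr->line_info,
					load_attr->line_info_cnt,
					load_attr->line_info_rec_size,
					attr.line_info_rec_size);
	if (!linfo)
		goto done;

	attr.line_info = ptr_to_u64(linfo);
	attr.line_info_rec_size = load_attr->line_info_rec_size;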

Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 tools/lib/bpf/bpf.c|  86 +++--
 tools/lib/bpf/bpf.h|   3 +
 tools/lib/bpf/btf.c| 209 +
 tools/lib/bpf/btf.h|  10 +-
 tools/lib/bpf/libbpf.c |  20 
 5 files changed, 239 insertions(+), 89 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 9fbbc0ed5952..3caaa3428774 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -173,11 +173,36 @@ int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name,
  -1);
 }
 
+static void *
+alloc_zero_tailing_info(const void *orecord, __u32 cnt,
+   __u32 actual_rec_size, __u32 expected_rec_size)
+{
+   __u64 info_len = actual_rec_size * cnt;
+   void *info, *nrecord;
+   int i;
+
+   info = malloc(info_len);
+   if (!info)
+   return NULL;
+
+   /* zero out bytes kernel does not understand */
+   nrecord = info;
+   for (i = 0; i < cnt; i++) {
+   memcpy(nrecord, orecord, expected_rec_size);
+   memset(nrecord + expected_rec_size, 0,
+  actual_rec_size - expected_rec_size);
+   orecord += actual_rec_size;
+   nrecord += actual_rec_size;
+   }
+
+   return info;
+}
+
 int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
   char *log_buf, size_t log_buf_sz)
 {
+   void *finfo = NULL, *linfo = NULL;
union bpf_attr attr;
-   void *finfo = NULL;
__u32 name_len;
int fd;
 
@@ -201,6 +226,9 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
attr.func_info_rec_size = load_attr->func_info_rec_size;
attr.func_info_cnt = load_attr->func_info_cnt;
attr.func_info = ptr_to_u64(load_attr->func_info);
+   attr.line_info_rec_size = load_attr->line_info_rec_size;
+   attr.line_info_cnt = load_attr->line_info_cnt;
+   attr.line_info = ptr_to_u64(load_attr->line_info);
memcpy(attr.prog_name, load_attr->name,
   min(name_len, BPF_OBJ_NAME_LEN - 1));
 
@@ -212,36 +240,35 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 * to give user space a hint how to deal with loading failure.
 * Check to see whether we can make some changes and load again.
 */
-   if (errno == E2BIG && attr.func_info_cnt &&
-   attr.func_info_rec_size < load_attr->func_info_rec_size) {
-   __u32 actual_rec_size = load_attr->func_info_rec_size;
-   __u32 expected_rec_size = attr.func_info_rec_size;
-   __u32 finfo_cnt = load_attr->func_info_cnt;
-   __u64 finfo_len = actual_rec_size * finfo_cnt;
-   const void *orecord;
-   void *nrecord;
-   int i;
-
-   finfo = malloc(finfo_len);
-   if (!finfo)
-   /* further try with log buffer won't help */
-   return fd;
-
-   /* zero out bytes kernel does not understand */
-   orecord = load_attr->func_info;
-   nrecord = finfo;
-   for (i = 0; i < load_attr->func_info_cnt; i++) {
-   memcpy(nrecord, orecord, expected_rec_size);
-   memset(nrecord + expected_rec_size, 0,
-  actual_rec_size - expected_rec_size);
-   orecord += actual_rec_size;
-   nrecord += actual_rec_size;
+   while (errno == E2BIG && (!finfo || !linfo)) {
+   if (!finfo && attr.func_info_cnt &&
+   attr.func_info_rec_size < load_attr->func_info_rec_size) {
+   /* try with corrected func info records */
+			finfo = alloc_zero_tailing_info(load_attr->func_info,
+							load_attr->func_info_cnt,
+							load_attr->func_info_rec_size,
+							attr.func_info_rec_size);
+   if (!finfo)
+   goto done;
+
+   attr.func_info = ptr_to_u64(finfo);
+   attr.func_info_rec_size = load_attr->func_info_rec_size;
+   } else if (!linfo && attr.line_info_cnt &&
+  attr.line_in

[PATCH bpf-next 2/7] bpf: tools: Sync uapi bpf.h

2018-12-07 Thread Martin KaFai Lau
Sync uapi bpf.h to tools/include/uapi/linux for
the new bpf_line_info.
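
For illustration (not part of the patch), a hedged sketch of decoding a
record with the macros added below.  The line number lives in the upper
22 bits of line_col and the column in the lower 10; btf_strs is assumed
to point at the BTF string section:

  #include <stdio.h>
  #include <linux/bpf.h>

  static void print_linfo(const struct bpf_line_info *linfo,
                          const char *btf_strs)
  {
          printf("%s:%u:%u\n",
                 btf_strs + linfo->file_name_off,
                 BPF_LINE_INFO_LINE_NUM(linfo->line_col),
                 BPF_LINE_INFO_LINE_COL(linfo->line_col));
  }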

Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 tools/include/uapi/linux/bpf.h | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 16263e8827fc..7973c28b24a0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -356,6 +356,9 @@ union bpf_attr {
__u32   func_info_rec_size; /* userspace 
bpf_func_info size */
__aligned_u64   func_info;  /* func info */
__u32   func_info_cnt;  /* number of bpf_func_info 
records */
+   __u32   line_info_rec_size; /* userspace 
bpf_line_info size */
+   __aligned_u64   line_info;  /* line info */
+   __u32   line_info_cnt;  /* number of bpf_line_info 
records */
};
 
struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -2679,6 +2682,12 @@ struct bpf_prog_info {
__u32 func_info_rec_size;
__aligned_u64 func_info;
__u32 func_info_cnt;
+   __u32 line_info_cnt;
+   __aligned_u64 line_info;
+   __aligned_u64 jited_line_info;
+   __u32 jited_line_info_cnt;
+   __u32 line_info_rec_size;
+   __u32 jited_line_info_rec_size;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -2995,4 +3004,14 @@ struct bpf_func_info {
__u32   type_id;
 };
 
+#define BPF_LINE_INFO_LINE_NUM(line_col)   ((line_col) >> 10)
+#define BPF_LINE_INFO_LINE_COL(line_col)   ((line_col) & 0x3ff)
+
+struct bpf_line_info {
+   __u32   insn_off;
+   __u32   file_name_off;
+   __u32   line_off;
+   __u32   line_col;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
2.17.1



[PATCH bpf-next 3/7] bpf: Refactor and bug fix in test_func_type in test_btf.c

2018-12-07 Thread Martin KaFai Lau
1) bpf_load_program_xattr() is absorbing the E2BIG error,
   which makes testing this case impossible.  It is replaced
   with a direct syscall(__NR_bpf, BPF_PROG_LOAD,...) as
   sketched after this list.
2) The test_func_type() is renamed to test_info_raw() to
   prepare for the new line_info test in the next patch.
3) The bpf_obj_get_info_by_fd() testing for func_info
   is refactored to test_get_finfo().  A new
   test_get_linfo() will be added in the next patch
   for testing line_info purpose.
4) The test->func_info_cnt is checked instead of
   a static value "2".
5) Remove unnecessary "\n" in error message.
6) Adding back info_raw_test_num to the cmd arg such
   that a specific test case can be tested, like
   all other existing tests.

7) Fix a bug in handling expected_prog_load_failure.
   A test could pass even if prog_fd != -1 while
   expected_prog_load_failure is true.
8) The min rec_size check should be < 8 instead of < 4.
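
The direct load in (1), roughly (a sketch; the attr setup itself is
unchanged):

  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/bpf.h>

  /* unlike bpf_load_program_xattr(), this lets the test observe
   * E2BIG from the kernel instead of having it absorbed by a
   * zero-tailing retry
   */
  static int syscall_prog_load(union bpf_attr *attr)
  {
          return syscall(__NR_bpf, BPF_PROG_LOAD, attr, sizeof(*attr));
  }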

Fixes: 4798c4ba3ba9 ("tools/bpf: extends test_btf to test load/retrieve func_type info")
Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 tools/testing/selftests/bpf/test_btf.c | 211 +++--
 1 file changed, 125 insertions(+), 86 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index ff0952ea757a..8d5777c89620 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -114,12 +115,13 @@ static struct args {
unsigned int raw_test_num;
unsigned int file_test_num;
unsigned int get_info_test_num;
+   unsigned int info_raw_test_num;
bool raw_test;
bool file_test;
bool get_info_test;
bool pprint_test;
bool always_log;
-   bool func_type_test;
+   bool info_raw_test;
 } args;
 
 static char btf_log_buf[BTF_LOG_BUF_SIZE];
@@ -3051,7 +3053,7 @@ static int test_pprint(void)
return err;
 }
 
-static struct btf_func_type_test {
+static struct prog_info_raw_test {
const char *descr;
const char *str_sec;
__u32 raw_types[MAX_NR_RAW_TYPES];
@@ -3062,7 +3064,7 @@ static struct btf_func_type_test {
__u32 func_info_rec_size;
__u32 func_info_cnt;
bool expected_prog_load_failure;
-} func_type_test[] = {
+} info_raw_tests[] = {
 {
.descr = "func_type (main func + one sub)",
.raw_types = {
@@ -3198,90 +3200,44 @@ static size_t probe_prog_length(const struct bpf_insn *fp)
return len + 1;
 }
 
-static int do_test_func_type(int test_num)
+static int test_get_finfo(const struct prog_info_raw_test *test,
+ int prog_fd)
 {
-	const struct btf_func_type_test *test = &func_type_test[test_num];
-   unsigned int raw_btf_size, info_len, rec_size;
-   int i, btf_fd = -1, prog_fd = -1, err = 0;
-   struct bpf_load_program_attr attr = {};
-   void *raw_btf, *func_info = NULL;
struct bpf_prog_info info = {};
struct bpf_func_info *finfo;
-
-   fprintf(stderr, "%s..", test->descr);
-	raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types,
-				 test->str_sec, test->str_sec_size,
-				 &raw_btf_size);
-
-   if (!raw_btf)
-   return -1;
-
-   *btf_log_buf = '\0';
-   btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
- btf_log_buf, BTF_LOG_BUF_SIZE,
- args.always_log);
-   free(raw_btf);
-
-   if (CHECK(btf_fd == -1, "invalid btf_fd errno:%d", errno)) {
-   err = -1;
-   goto done;
-   }
-
-   if (*btf_log_buf && args.always_log)
-   fprintf(stderr, "\n%s", btf_log_buf);
-
-   attr.prog_type = test->prog_type;
-   attr.insns = test->insns;
-   attr.insns_cnt = probe_prog_length(attr.insns);
-   attr.license = "GPL";
-   attr.prog_btf_fd = btf_fd;
-   attr.func_info_rec_size = test->func_info_rec_size;
-   attr.func_info_cnt = test->func_info_cnt;
-   attr.func_info = test->func_info;
-
-   *btf_log_buf = '\0';
-	prog_fd = bpf_load_program_xattr(&attr, btf_log_buf,
-BTF_LOG_BUF_SIZE);
-   if (test->expected_prog_load_failure && prog_fd == -1) {
-   err = 0;
-   goto done;
-   }
-   if (CHECK(prog_fd == -1, "invalid prog_id errno:%d", errno)) {
-   fprintf(stderr, "%s\n", btf_log_buf);
-   err = -1;
-   goto done;
-   }
+   __u32 info_len, rec_size, i;
+   void *func_info = NULL;
+   int err;
 
/* get necessary lens */
info_len = sizeof(struct bpf_prog_info);
	err = bpf_obj_get_info_by_fd(prog_fd, &info,

[PATCH bpf-next 0/7] Introduce bpf_line_info

2018-12-07 Thread Martin KaFai Lau
This patch series introduces the bpf_line_info.  Please see individual patch
for details.

It will be useful for introspection purposes, like:

[root@arch-fb-vm1 bpf]# ~/devshare/fb-kernel/linux/tools/bpf/bpftool/bpftool prog dump jited pinned /sys/fs/bpf/test_btf_haskv
[...]
int test_long_fname_2(struct dummy_tracepoint_args * arg):
bpf_prog_44a040bf25481309_test_long_fname_2:
; static int test_long_fname_2(struct dummy_tracepoint_args *arg)
   0:	push   %rbp
   1:	mov    %rsp,%rbp
   4:	sub    $0x30,%rsp
   b:	sub    $0x28,%rbp
   f:	mov    %rbx,0x0(%rbp)
  13:	mov    %r13,0x8(%rbp)
  17:	mov    %r14,0x10(%rbp)
  1b:	mov    %r15,0x18(%rbp)
  1f:	xor    %eax,%eax
  21:	mov    %rax,0x20(%rbp)
  25:	xor    %esi,%esi
; int key = 0;
  27:	mov    %esi,-0x4(%rbp)
; if (!arg->sock)
  2a:	mov    0x8(%rdi),%rdi
; if (!arg->sock)
  2e:	cmp    $0x0,%rdi
  32:	je     0x0000000000000070
  34:	mov    %rbp,%rsi
; counts = bpf_map_lookup_elem(&btf_map, &key);
  37:	add    $0xfffffffffffffffc,%rsi
  3b:	movabs $0xffff8881139d7480,%rdi
  45:	add    $0x110,%rdi
  4c:	mov    0x0(%rsi),%eax
  4f:	cmp    $0x4,%rax
  53:	jae    0x000000000000005e
  55:	shl    $0x3,%rax
  59:	add    %rdi,%rax
  5c:	jmp    0x0000000000000060
  5e:	xor    %eax,%eax
; if (!counts)
  60:	cmp    $0x0,%rax
  64:	je     0x0000000000000070
; counts->v6++;
  66:	mov    0x4(%rax),%edi
  69:	add    $0x1,%rdi
  6d:	mov    %edi,0x4(%rax)
  70:	mov    0x0(%rbp),%rbx
  74:	mov    0x8(%rbp),%r13
  78:	mov    0x10(%rbp),%r14
  7c:	mov    0x18(%rbp),%r15
  80:	add    $0x28,%rbp
  84:	leaveq
  85:	retq
[...]

Martin KaFai Lau (7):
  bpf: Add bpf_line_info support
  bpf: tools: Sync uapi bpf.h
  bpf: Refactor and bug fix in test_func_type in test_btf.c
  bpf: Add unit tests for bpf_line_info
  bpf: libbpf: Refactor and bug fix on the bpf_func_info loading logic
  bpf: libbpf: Add btf_line_info support to libbpf
  bpf: libbpf: bpftool: Print bpf_line_info during prog dump

 arch/x86/net/bpf_jit_comp.c   |   2 +
 include/linux/bpf.h   |  21 +
 include/linux/bpf_verifier.h  |   1 +
 include/linux/btf.h   |   1 +
 include/linux/filter.h|   7 +
 include/uapi/linux/bpf.h  |  19 +
 kernel/bpf/btf.c  |   2 +-
 kernel/bpf/core.c | 118 ++-
 kernel/bpf/syscall.c  |  83 +-
 kernel/bpf/verifier.c | 198 -
 .../bpftool/Documentation/bpftool-prog.rst|  16 +-
 tools/bpf/bpftool/bash-completion/bpftool |   6 +-
 tools/bpf/bpftool/btf_dumper.c|  64 ++
 tools/bpf/bpftool/jit_disasm.c|  23 +-
 tools/bpf/bpftool/main.h  |  23 +-
 tools/bpf/bpftool/prog.c  | 100 ++-
 tools/bpf/bpftool/xlated_dumper.c |  30 +-
 tools/bpf/bpftool/xlated_dumper.h |   7 +-
 tools/include/uapi/linux/bpf.h|  19 +
 tools/lib/bpf/Build   |   2 +-
 tools/lib/bpf/bpf.c   |  93 ++-
 tools/lib/bpf/bpf.h   |   3 +
 tools/lib/bpf/bpf_prog_linfo.c| 253 ++
 tools/lib/bpf/btf.c   | 342 
 tools/lib/bpf/btf.h   |  25 +-
 tools/lib/bpf/libbpf.c| 159 +++-
 tools/lib/bpf/libbpf.h|  13 +
 tools/lib/bpf/libbpf.map  |   4 +
 tools/testing/selftests/bpf/test_btf.c| 790 +++---
 29 files changed, 2036 insertions(+), 388 deletions(-)
 create mode 100644 tools/lib/bpf/bpf_prog_linfo.c

-- 
2.17.1



[PATCH bpf-next 7/7] bpf: libbpf: bpftool: Print bpf_line_info during prog dump

2018-12-07 Thread Martin KaFai Lau
This patch adds printing of bpf_line_info in 'prog dump jited'
and 'prog dump xlated':

[root@arch-fb-vm1 bpf]# ~/devshare/fb-kernel/linux/tools/bpf/bpftool/bpftool prog dump jited pinned /sys/fs/bpf/test_btf_haskv
[...]
int test_long_fname_2(struct dummy_tracepoint_args * arg):
bpf_prog_44a040bf25481309_test_long_fname_2:
; static int test_long_fname_2(struct dummy_tracepoint_args *arg)
   0:	push   %rbp
   1:	mov    %rsp,%rbp
   4:	sub    $0x30,%rsp
   b:	sub    $0x28,%rbp
   f:	mov    %rbx,0x0(%rbp)
  13:	mov    %r13,0x8(%rbp)
  17:	mov    %r14,0x10(%rbp)
  1b:	mov    %r15,0x18(%rbp)
  1f:	xor    %eax,%eax
  21:	mov    %rax,0x20(%rbp)
  25:	xor    %esi,%esi
; int key = 0;
  27:	mov    %esi,-0x4(%rbp)
; if (!arg->sock)
  2a:	mov    0x8(%rdi),%rdi
; if (!arg->sock)
  2e:	cmp    $0x0,%rdi
  32:	je     0x0000000000000070
  34:	mov    %rbp,%rsi
; counts = bpf_map_lookup_elem(&btf_map, &key);
  37:	add    $0xfffffffffffffffc,%rsi
  3b:	movabs $0xffff8881139d7480,%rdi
  45:	add    $0x110,%rdi
  4c:	mov    0x0(%rsi),%eax
  4f:	cmp    $0x4,%rax
  53:	jae    0x000000000000005e
  55:	shl    $0x3,%rax
  59:	add    %rdi,%rax
  5c:	jmp    0x0000000000000060
  5e:	xor    %eax,%eax
; if (!counts)
  60:	cmp    $0x0,%rax
  64:	je     0x0000000000000070
; counts->v6++;
  66:	mov    0x4(%rax),%edi
  69:	add    $0x1,%rdi
  6d:	mov    %edi,0x4(%rax)
  70:	mov    0x0(%rbp),%rbx
  74:	mov    0x8(%rbp),%r13
  78:	mov    0x10(%rbp),%r14
  7c:	mov    0x18(%rbp),%r15
  80:	add    $0x28,%rbp
  84:	leaveq
  85:	retq
[...]

With linum:
[root@arch-fb-vm1 bpf]# ~/devshare/fb-kernel/linux/tools/bpf/bpftool/bpftool prog dump jited pinned /sys/fs/bpf/test_btf_haskv linum
int _dummy_tracepoint(struct dummy_tracepoint_args * arg):
bpf_prog_b07ccb89267cf242__dummy_tracepoint:
; return test_long_fname_1(arg); [file:/data/users/kafai/fb-kernel/linux/tools/testing/selftests/bpf/test_btf_haskv.c line_num:54 line_col:9]
   0:	push   %rbp
   1:	mov    %rsp,%rbp
   4:	sub    $0x28,%rsp
   b:	sub    $0x28,%rbp
   f:	mov    %rbx,0x0(%rbp)
  13:	mov    %r13,0x8(%rbp)
  17:	mov    %r14,0x10(%rbp)
  1b:	mov    %r15,0x18(%rbp)
  1f:	xor    %eax,%eax
  21:	mov    %rax,0x20(%rbp)
  25:	callq  0x851e
; return test_long_fname_1(arg); [file:/data/users/kafai/fb-kernel/linux/tools/testing/selftests/bpf/test_btf_haskv.c line_num:54 line_col:2]
  2a:	xor    %eax,%eax
  2c:	mov    0x0(%rbp),%rbx
  30:	mov    0x8(%rbp),%r13
  34:	mov    0x10(%rbp),%r14
  38:	mov    0x18(%rbp),%r15
  3c:	add    $0x28,%rbp
  40:	leaveq
  41:	retq
[...]
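
The dump above is driven by the new tools/lib/bpf/bpf_prog_linfo.c (see
the diffstat below).  A hedged sketch of how a consumer like bpftool
uses it; the function names come from libbpf.map in this patch but the
exact signatures are assumed, and info/addr stand for a filled
struct bpf_prog_info and a jited address:

  const struct bpf_line_info *linfo;
  struct bpf_prog_linfo *prog_linfo;

  prog_linfo = bpf_prog_linfo__new(&info);
  if (prog_linfo) {
          /* line info covering a jited addr in func 0, skipping none */
          linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, addr, 0, 0);
          if (linfo)
                  /* resolve file/line via the BTF string section */;
          bpf_prog_linfo__free(prog_linfo);
  }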

Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 .../bpftool/Documentation/bpftool-prog.rst|  16 +-
 tools/bpf/bpftool/bash-completion/bpftool |   6 +-
 tools/bpf/bpftool/btf_dumper.c|  64 +
 tools/bpf/bpftool/jit_disasm.c|  23 +-
 tools/bpf/bpftool/main.h  |  23 +-
 tools/bpf/bpftool/prog.c  | 100 ++-
 tools/bpf/bpftool/xlated_dumper.c |  30 ++-
 tools/bpf/bpftool/xlated_dumper.h |   7 +-
 tools/lib/bpf/Build   |   2 +-
 tools/lib/bpf/bpf_prog_linfo.c| 253 ++
 tools/lib/bpf/libbpf.h|  13 +
 tools/lib/bpf/libbpf.map  |   4 +
 12 files changed, 516 insertions(+), 25 deletions(-)
 create mode 100644 tools/lib/bpf/bpf_prog_linfo.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 5524b6dccd85..7c30731a9b73 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -22,8 +22,8 @@ MAP COMMANDS
 =
 
 |  **bpftool** **prog { show | list }** [*PROG*]
-|	**bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** | **visual**}]
-|	**bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | **opcodes**}]
+|	**bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** | **visual** | **linum**}]
+|	**bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | **opcodes** | **linum**}]
 |  **bpftool** **prog pin** *PROG* *FILE*
|	**bpftool** **prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
 |  **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
@@ -56,7 +56,7 @@ DESCRIPTION
  Output will start with program ID followed by program type and
  zero or more named attributes (depending on kernel version).
 
-   **bpftool prog dump xlated** *PROG* [{ **file** *FILE* | **opcodes** | **visual** }]
+   **bpftool prog dump xlated** *PROG* [{ **file** *FILE* | **opcodes** | **visual** | **linum** }]
  Dump eBPF instructions of the program from t

[PATCH bpf-next 4/7] bpf: Add unit tests for bpf_line_info

2018-12-07 Thread Martin KaFai Lau
Add unit tests for bpf_line_info for both BPF_PROG_LOAD and
BPF_OBJ_GET_INFO_BY_FD.

jit enabled:
[root@arch-fb-vm1 bpf]# ./test_btf -k 0
BTF prog info raw test[5] (line_info (No subprog)): OK
BTF prog info raw test[6] (line_info (No subprog. insn_off >= prog->len)): OK
BTF prog info raw test[7] (line_info (No subprog. zero tailing line_info)): OK
BTF prog info raw test[8] (line_info (No subprog. nonzero tailing line_info)): OK
BTF prog info raw test[9] (line_info (subprog)): OK
BTF prog info raw test[10] (line_info (subprog + func_info)): OK
BTF prog info raw test[11] (line_info (subprog. missing 1st func line info)): OK
BTF prog info raw test[12] (line_info (subprog. missing 2nd func line info)): OK
BTF prog info raw test[13] (line_info (subprog. unordered insn offset)): OK

jit disabled:
BTF prog info raw test[5] (line_info (No subprog)): not jited. skipping jited_line_info check. OK
BTF prog info raw test[6] (line_info (No subprog. insn_off >= prog->len)): OK
BTF prog info raw test[7] (line_info (No subprog. zero tailing line_info)): not jited. skipping jited_line_info check. OK
BTF prog info raw test[8] (line_info (No subprog. nonzero tailing line_info)): OK
BTF prog info raw test[9] (line_info (subprog)): not jited. skipping jited_line_info check. OK
BTF prog info raw test[10] (line_info (subprog + func_info)): not jited. skipping jited_line_info check. OK
BTF prog info raw test[11] (line_info (subprog. missing 1st func line info)): OK
BTF prog info raw test[12] (line_info (subprog. missing 2nd func line info)): OK
BTF prog info raw test[13] (line_info (subprog. unordered insn offset)): OK
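
For reference, the invariants the rejection tests above exercise, as a
hedged sketch (the insn and string offsets are hypothetical):

  static const struct bpf_line_info linfo[] = {
          /* the main func's first insn must have line info */
          { .insn_off = 0, .file_name_off = 1, .line_off = 10,
            .line_col = (54 << 10) | 9, },
          /* insn_off must be strictly increasing and < prog->len;
           * a subprog's first insn (say insn 4) must be covered too
           */
          { .insn_off = 4, .file_name_off = 1, .line_off = 20,
            .line_col = (57 << 10) | 2, },
  };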

Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 tools/testing/selftests/bpf/test_btf.c | 597 -
 1 file changed, 580 insertions(+), 17 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index 8d5777c89620..7707273736ac 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -108,7 +108,7 @@ static int __base_pr(const char *format, ...)
 #define BTF_END_RAW 0xdeadbeef
 #define NAME_TBD 0xdeadb33f
 
-#define MAX_NR_RAW_TYPES 1024
+#define MAX_NR_RAW_U32 1024
 #define BTF_LOG_BUF_SIZE 65535
 
 static struct args {
@@ -137,7 +137,7 @@ struct btf_raw_test {
const char *str_sec;
const char *map_name;
const char *err_str;
-   __u32 raw_types[MAX_NR_RAW_TYPES];
+   __u32 raw_types[MAX_NR_RAW_U32];
__u32 str_sec_size;
enum bpf_map_type map_type;
__u32 key_size;
@@ -156,6 +156,9 @@ struct btf_raw_test {
int str_len_delta;
 };
 
+#define BTF_STR_SEC(str) \
+   .str_sec = str, .str_sec_size = sizeof(str)
+
 static struct btf_raw_test raw_tests[] = {
 /* enum E {
  * E0,
@@ -1858,11 +1861,11 @@ static const char *get_next_str(const char *start, const char *end)
return start < end - 1 ? start + 1 : NULL;
 }
 
-static int get_type_sec_size(const __u32 *raw_types)
+static int get_raw_sec_size(const __u32 *raw_types)
 {
int i;
 
-   for (i = MAX_NR_RAW_TYPES - 1;
+   for (i = MAX_NR_RAW_U32 - 1;
 i >= 0 && raw_types[i] != BTF_END_RAW;
 i--)
;
@@ -1874,7 +1877,8 @@ static void *btf_raw_create(const struct btf_header *hdr,
const __u32 *raw_types,
const char *str,
unsigned int str_sec_size,
-   unsigned int *btf_size)
+   unsigned int *btf_size,
+   const char **ret_next_str)
 {
const char *next_str = str, *end_str = str + str_sec_size;
unsigned int size_needed, offset;
@@ -1883,7 +1887,7 @@ static void *btf_raw_create(const struct btf_header *hdr,
uint32_t *ret_types;
void *raw_btf;
 
-   type_sec_size = get_type_sec_size(raw_types);
+   type_sec_size = get_raw_sec_size(raw_types);
if (CHECK(type_sec_size < 0, "Cannot get nr_raw_types"))
return NULL;
 
@@ -1922,6 +1926,8 @@ static void *btf_raw_create(const struct btf_header *hdr,
ret_hdr->str_len = str_sec_size;
 
*btf_size = size_needed;
+   if (ret_next_str)
+   *ret_next_str = next_str;
 
return raw_btf;
 }
@@ -1941,7 +1947,7 @@ static int do_test_raw(unsigned int test_num)
 test->raw_types,
 test->str_sec,
 test->str_sec_size,
-				 &raw_btf_size);
+				 &raw_btf_size, NULL);
 
if (!raw_btf)
return -1;
@@ -2018,7 +2024,7 @@ static int test_raw(void)
 struct btf_get_info_test {
const char *descr;
const char *str_sec;
-   __u32 raw_types[MAX_NR_RAW_TYPES];
+   __u32 raw_types[MAX_NR_RAW_U32];
__u32 st

[PATCH bpf-next 5/7] bpf: libbpf: Refactor and bug fix on the bpf_func_info loading logic

2018-12-07 Thread Martin KaFai Lau
This patch refactors, and fixes a bug in, the libbpf bpf_func_info
loading logic.  Both the bug fix and the refactoring target the same
commit 2993e0515bb4 ("tools/bpf: add support to read .BTF.ext sections")
which is in the bpf-next branch.

1) In bpf_load_program_xattr(), it should retry when errno == E2BIG
   regardless of log_buf and log_buf_sz.  This patch fixes it (see
   the sketch after this list).

2) btf_ext__reloc_init() and btf_ext__reloc() are essentially
   the same except btf_ext__reloc_init() always has insns_cnt == 0.
   Hence, btf_ext__reloc_init() is removed.

   btf_ext__reloc() is also renamed to btf_ext__reloc_func_info()
   to get ready for the line_info support in the next patch.

3) Consolidate func_info section logic from "btf_ext_parse_hdr()",
   "btf_ext_validate_func_info()" and "btf_ext__new()" to
   a new function "btf_ext_copy_func_info()" such that similar
   logic can be reused by the later libbpf's line_info patch.

4) The next line_info patch will store line_info_cnt instead of
   line_info_len in the bpf_program because the kernel is taking
   line_info_cnt also.  It will save a few "len" to "cnt" conversions
   and will also save some function args.

   Hence, this patch also makes bpf_program to store func_info_cnt
   instead of func_info_len.

5) btf_ext depends on btf.  e.g. the func_info's type_id
   in ".BTF.ext" is not useful when ".BTF" is absent.
   This patch only initializes the obj->btf_ext pointer after
   it has successfully initialized the obj->btf pointer.

   This avoids always checking "obj->btf && obj->btf_ext"
   together when accessing ".BTF.ext".  Checking "obj->btf_ext"
   alone will do.

6) Move "struct btf_sec_func_info" from btf.h to btf.c.
   There is no external usage outside btf.c.
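
Points (1) and (4) in miniature (a sketch, not the patch code itself):

	/* (1) retry on E2BIG even without a log buffer */
	fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
	if (fd >= 0)	/* was: fd >= 0 || !log_buf || !log_buf_sz */
		return fd;

	/* (4) store a cnt instead of a len; the kernel takes a cnt */
	prog->func_info_cnt = func_info_len / func_info_rec_size;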

Fixes: 2993e0515bb4 ("tools/bpf: add support to read .BTF.ext sections")
Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 tools/lib/bpf/bpf.c|   7 +-
 tools/lib/bpf/btf.c| 191 -
 tools/lib/bpf/btf.h|  17 +---
 tools/lib/bpf/libbpf.c | 139 --
 4 files changed, 177 insertions(+), 177 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 5c3be06bf0dd..9fbbc0ed5952 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -205,7 +205,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
   min(name_len, BPF_OBJ_NAME_LEN - 1));
 
	fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
-   if (fd >= 0 || !log_buf || !log_buf_sz)
+   if (fd >= 0)
return fd;
 
/* After bpf_prog_load, the kernel may modify certain attributes
@@ -244,10 +244,13 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 
	fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 
-   if (fd >= 0 || !log_buf || !log_buf_sz)
+   if (fd >= 0)
goto done;
}
 
+   if (!log_buf || !log_buf_sz)
+   goto done;
+
/* Try again with log */
attr.log_buf = ptr_to_u64(log_buf);
attr.log_size = log_buf_sz;
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 85d6446cf832..aa4fa02b13fc 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -43,6 +43,13 @@ struct btf_ext {
__u32 func_info_len;
 };
 
+struct btf_sec_func_info {
+   __u32   sec_name_off;
+   __u32   num_func_info;
+   /* Followed by num_func_info number of bpf func_info records */
+   __u8data[0];
+};
+
 /* The minimum bpf_func_info checked by the loader */
 struct bpf_func_info_min {
__u32   insn_off;
@@ -479,41 +486,66 @@ int btf__get_from_id(__u32 id, struct btf **btf)
return err;
 }
 
-static int btf_ext_validate_func_info(const void *finfo, __u32 size,
- btf_print_fn_t err_log)
+static int btf_ext_copy_func_info(struct btf_ext *btf_ext,
+ __u8 *data, __u32 data_size,
+ btf_print_fn_t err_log)
 {
-   int sec_hdrlen = sizeof(struct btf_sec_func_info);
-   __u32 size_left, num_records, record_size;
+   const struct btf_ext_header *hdr = (struct btf_ext_header *)data;
const struct btf_sec_func_info *sinfo;
-   __u64 total_record_size;
+   __u32 info_left, record_size;
+   /* The start of the info sec (including the __u32 record_size). */
+   const void *info;
+
+   /* data and data_size do not include btf_ext_header from now on */
+   data = data + hdr->hdr_len;
+   data_size -= hdr->hdr_len;
+
+   if (hdr->func_info_off & 0x03) {
+   elog("BTF.ext func_info section is not aligned to 4 bytes\n");
+   return -EINVAL;
+   }
+
+   if (data_si

[PATCH bpf-next 1/4] bpf: Improve the info.func_info and info.func_info_rec_size behavior

2018-12-05 Thread Martin KaFai Lau
1) When bpf_dump_raw_ok() == false and the kernel can provide >=1
   func_info to the userspace, the current behavior is setting
   the info.func_info_cnt to 0 instead of setting info.func_info
   to 0.

   It is different from the behavior in jited_func_lens/nr_jited_func_lens,
   jited_ksyms/nr_jited_ksyms...etc.

   This patch fixes it. (i.e. set func_info to 0 instead of
   func_info_cnt to 0 when bpf_dump_raw_ok() == false).

2) When the userspace passed in info.func_info_cnt == 0, the kernel
   will set the expected func_info size back to the
   info.func_info_rec_size.  It is a way for the userspace to learn
   the kernel expected func_info_rec_size introduced in
   commit 838e96904ff3 ("bpf: Introduce bpf_func_info").

   An exception is that the kernel expected size is not set when
   func_info is not available for a bpf_prog.  This makes the
   returned info.func_info_rec_size have different values
   depending on the returned value of info.func_info_cnt.

   This patch sets the kernel expected size in info.func_info_rec_size
   independent of info.func_info_cnt (see the sketch after this list).

3) The current logic only rejects an invalid func_info_rec_size if
   func_info_cnt is non zero.  This patch also rejects a nonzero
   info.func_info_rec_size that is not equal to the kernel
   expected size.

4) Set info.btf_id as long as prog->aux->btf != NULL.  That makes
   the later copy_to_user() code look the same as the others,
   which is then easier to understand and maintain.

   prog->aux->btf is not NULL only if prog->aux->func_info_cnt > 0.

   Breaking up info.btf_id from prog->aux->func_info_cnt is needed
   for the later line info patch anyway.

   A similar change is made to bpf_get_prog_name().
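
For illustration, a sketch of the rec_size probe described in (2)
(hypothetical snippet; prog_fd is assumed to be a valid prog fd):

  struct bpf_prog_info info = {};
  __u32 info_len = sizeof(info);

  /* func_info_cnt == 0: only ask for the kernel expected rec_size */
  if (!bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
          printf("kernel expects %u-byte func_info records\n",
                 info.func_info_rec_size);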

Fixes: 838e96904ff3 ("bpf: Introduce bpf_func_info")
Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 kernel/bpf/core.c|  2 +-
 kernel/bpf/syscall.c | 46 +++-
 2 files changed, 21 insertions(+), 27 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f93ed667546f..2a73fda1db5f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -410,7 +410,7 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
 
/* prog->aux->name will be ignored if full btf name is available */
-   if (prog->aux->btf) {
+   if (prog->aux->func_info_cnt) {
		type = btf_type_by_id(prog->aux->btf,
				      prog->aux->func_info[prog->aux->func_idx].type_id);
func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4445d0d084d8..aa05aa38f4a8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2083,6 +2083,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
return -EFAULT;
}
 
+   if ((info.func_info_cnt || info.func_info_rec_size) &&
+   info.func_info_rec_size != sizeof(struct bpf_func_info))
+   return -EINVAL;
+
+   info.func_info_rec_size = sizeof(struct bpf_func_info);
+
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
@@ -2226,35 +2232,23 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
}
}
 
-   if (prog->aux->btf) {
-   u32 krec_size = sizeof(struct bpf_func_info);
-   u32 ucnt, urec_size;
-
+   if (prog->aux->btf)
info.btf_id = btf_id(prog->aux->btf);
 
-   ucnt = info.func_info_cnt;
-   info.func_info_cnt = prog->aux->func_info_cnt;
-   urec_size = info.func_info_rec_size;
-   info.func_info_rec_size = krec_size;
-   if (ucnt) {
-			/* expect passed-in urec_size is what the kernel expects */
-   if (urec_size != info.func_info_rec_size)
-   return -EINVAL;
-
-   if (bpf_dump_raw_ok()) {
-   char __user *user_finfo;
-
-   user_finfo = u64_to_user_ptr(info.func_info);
-   ucnt = min_t(u32, info.func_info_cnt, ucnt);
-				if (copy_to_user(user_finfo, prog->aux->func_info,
-						 krec_size * ucnt))
-					return -EFAULT;
-   } else {
-   info.func_info_cnt = 0;
-   }
+   ulen = info.func_info_cnt;
+   info.func_info_cnt = prog->aux->func_info_cnt;
+   if (info.func_info_cnt && ulen) {
+   if (bpf_dump_raw_ok()) {
+   char

[PATCH bpf-next 4/4] bpf: Expect !info.func_info and insn_off name changes in test_btf/libbpf/bpftool

2018-12-05 Thread Martin KaFai Lau
Similar to info.jited_*, info.func_info could be 0 if
bpf_dump_raw_ok() == false.

This patch makes changes to test_btf and bpftool to expect info.func_info
could be 0.

This patch also makes the needed changes for s/insn_offset/insn_off/.

Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 tools/bpf/bpftool/prog.c   |  7 +++
 tools/bpf/bpftool/xlated_dumper.c  |  4 ++--
 tools/lib/bpf/btf.c| 12 ++--
 tools/testing/selftests/bpf/test_btf.c |  8 +++-
 4 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 56db61c5a91f..3148bc0e225b 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -589,6 +589,13 @@ static int do_dump(int argc, char **argv)
goto err_free;
}
 
+   if (func_info && !info.func_info) {
+   /* kernel.kptr_restrict is set.  No func_info available. */
+   free(func_info);
+   func_info = NULL;
+   finfo_cnt = 0;
+   }
+
	if ((member_len == &info.jited_prog_len &&
	     info.jited_prog_insns == 0) ||
	    (member_len == &info.xlated_prog_len &&
diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c
index e06ac0286a75..131ecd175533 100644
--- a/tools/bpf/bpftool/xlated_dumper.c
+++ b/tools/bpf/bpftool/xlated_dumper.c
@@ -261,7 +261,7 @@ void dump_xlated_json(struct dump_data *dd, void *buf, unsigned int len,
jsonw_start_object(json_wtr);
 
if (btf && record) {
-   if (record->insn_offset == i) {
+   if (record->insn_off == i) {
btf_dumper_type_only(btf, record->type_id,
 func_sig,
 sizeof(func_sig));
@@ -330,7 +330,7 @@ void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len,
}
 
if (btf && record) {
-   if (record->insn_offset == i) {
+   if (record->insn_off == i) {
btf_dumper_type_only(btf, record->type_id,
 func_sig,
 sizeof(func_sig));
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index c2d641f3e16e..85d6446cf832 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -45,7 +45,7 @@ struct btf_ext {
 
 /* The minimum bpf_func_info checked by the loader */
 struct bpf_func_info_min {
-   __u32   insn_offset;
+   __u32   insn_off;
__u32   type_id;
 };
 
@@ -670,7 +670,7 @@ int btf_ext__reloc_init(struct btf *btf, struct btf_ext *btf_ext,
 
memcpy(data, sinfo->data, records_len);
 
-   /* adjust the insn_offset, the data in .BTF.ext is
+   /* adjust the insn_off, the data in .BTF.ext is
 * the actual byte offset, and the kernel expects
 * the offset in term of bpf_insn.
 *
@@ -681,7 +681,7 @@ int btf_ext__reloc_init(struct btf *btf, struct btf_ext *btf_ext,
struct bpf_func_info_min *record;
 
record = data + i * record_size;
-   record->insn_offset /= sizeof(struct bpf_insn);
+   record->insn_off /= sizeof(struct bpf_insn);
}
 
*func_info = data;
@@ -722,15 +722,15 @@ int btf_ext__reloc(struct btf *btf, struct btf_ext *btf_ext,
return -ENOMEM;
 
memcpy(data + existing_flen, sinfo->data, records_len);
-   /* adjust insn_offset only, the rest data will be passed
+   /* adjust insn_off only, the rest data will be passed
 * to the kernel.
 */
for (i = 0; i < sinfo->num_func_info; i++) {
struct bpf_func_info_min *record;
 
record = data + existing_flen + i * record_size;
-   record->insn_offset =
-   record->insn_offset / sizeof(struct bpf_insn) +
+   record->insn_off =
+   record->insn_off / sizeof(struct bpf_insn) +
insns_cnt;
}
*func_info = data;
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index bae7308b7ec5..ff0952ea757a 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -3156,7 +3156,7 @@ static struct btf_func_type_test {
 },
 
 {
-   .descr = "func_type (Incorrect bpf_func_info.insn_offset)",
+   .descr = "func_type (Incorrect bpf_func_info.insn_off)

[PATCH bpf-next 2/4] bpf: Change insn_offset to insn_off in bpf_func_info

2018-12-05 Thread Martin KaFai Lau
The later patch will introduce "struct bpf_line_info", which
has the members "line_off" and "file_off" referring back to the
string section in BTF.  The line_"off" and file_"off" names
are more consistent with the naming convention in btf.h
that means "offset" (e.g. name_off in "struct btf_type").

The to-be-added "struct bpf_line_info" also has another
member, "insn_off" which is the same as the "insn_offset"
in "struct bpf_func_info".  Hence, this patch renames "insn_offset"
to "insn_off" for "struct bpf_func_info".

Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 include/uapi/linux/bpf.h |  2 +-
 kernel/bpf/verifier.c| 18 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8e12c5f..a84fd232d934 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2991,7 +2991,7 @@ struct bpf_flow_keys {
 };
 
 struct bpf_func_info {
-   __u32   insn_offset;
+   __u32   insn_off;
__u32   type_id;
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 71988337ac14..7658c61c1a88 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4707,24 +4707,24 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
goto free_btf;
}
 
-   /* check insn_offset */
+   /* check insn_off */
if (i == 0) {
-   if (krecord[i].insn_offset) {
+   if (krecord[i].insn_off) {
			verbose(env,
-				"nonzero insn_offset %u for the first func info record",
-				krecord[i].insn_offset);
+				"nonzero insn_off %u for the first func info record",
+				krecord[i].insn_off);
ret = -EINVAL;
goto free_btf;
}
-   } else if (krecord[i].insn_offset <= prev_offset) {
+   } else if (krecord[i].insn_off <= prev_offset) {
		verbose(env,
			"same or smaller insn offset (%u) than previous func info record (%u)",
-			krecord[i].insn_offset, prev_offset);
+			krecord[i].insn_off, prev_offset);
ret = -EINVAL;
goto free_btf;
}
 
-   if (env->subprog_info[i].start != krecord[i].insn_offset) {
+   if (env->subprog_info[i].start != krecord[i].insn_off) {
			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
ret = -EINVAL;
goto free_btf;
@@ -4739,7 +4739,7 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
goto free_btf;
}
 
-   prev_offset = krecord[i].insn_offset;
+   prev_offset = krecord[i].insn_off;
urecord += urec_size;
}
 
@@ -4762,7 +4762,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
return;
 
for (i = 0; i < env->subprog_cnt; i++)
-		env->prog->aux->func_info[i].insn_offset = env->subprog_info[i].start;
+		env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start;
 }
 
 /* check %cur's range satisfies %old's */
-- 
2.17.1



[PATCH bpf-next 3/4] bpf: tools: Sync uapi bpf.h for the name changes in bpf_func_info

2018-12-05 Thread Martin KaFai Lau
This patch syncs the name changes in bpf_func_info to
tools/.

Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 tools/include/uapi/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 64262890feb2..16263e8827fc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2991,7 +2991,7 @@ struct bpf_flow_keys {
 };
 
 struct bpf_func_info {
-   __u32   insn_offset;
+   __u32   insn_off;
__u32   type_id;
 };
 
-- 
2.17.1



[PATCH bpf-next 0/4] Misc improvements on bpf_func_info

2018-12-05 Thread Martin KaFai Lau
The patchset has a few improvements on bpf_func_info:
1. Improvements on the behaviors of info.func_info, info.func_info_cnt
   and info.func_info_rec_size.
2. Name change: s/insn_offset/insn_off/

Please see individual patch for details.

Martin KaFai Lau (4):
  bpf: Improve the info.func_info and info.func_info_rec_size behavior
  bpf: Change insn_offset to insn_off in bpf_func_info
  bpf: tools: Sync uapi bpf.h for the name changes in bpf_func_info
  bpf: Expect !info.func_info and insn_off name changes in
test_btf/libbpf/bpftool

 include/uapi/linux/bpf.h   |  2 +-
 kernel/bpf/core.c  |  2 +-
 kernel/bpf/syscall.c   | 46 +++---
 kernel/bpf/verifier.c  | 18 +-
 tools/bpf/bpftool/prog.c   |  7 
 tools/bpf/bpftool/xlated_dumper.c  |  4 +--
 tools/include/uapi/linux/bpf.h |  2 +-
 tools/lib/bpf/btf.c| 12 +++
 tools/testing/selftests/bpf/test_btf.c |  8 -
 9 files changed, 54 insertions(+), 47 deletions(-)

-- 
2.17.1



[PATCH bpf-next] bpf: Fix memleak in aux->func_info and aux->btf

2018-12-01 Thread Martin KaFai Lau
The aux->func_info and aux->btf are leaked in the error paths
during bpf_prog_load().  This patch fixes it.

Fixes: ba64e7d85252 ("bpf: btf: support proper non-jit func info")
Cc: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 kernel/bpf/syscall.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f9554d9a14e1..4445d0d084d8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1560,6 +1560,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
return err;
 
 free_used_maps:
+   kvfree(prog->aux->func_info);
+   btf_put(prog->aux->btf);
bpf_prog_kallsyms_del_subprogs(prog);
free_used_maps(prog->aux);
 free_prog:
-- 
2.17.1



[PATCH v5 bpf-next 10/13] tools/bpf: do not use pahole if clang/llvm can generate BTF sections

2018-11-19 Thread Martin KaFai Lau
From: Yonghong Song 

Add additional checks in tools/testing/selftests/bpf and
samples/bpf such that if clang/llvm compiler can generate
BTF sections, do not use pahole.

Signed-off-by: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
---
 samples/bpf/Makefile | 8 
 tools/testing/selftests/bpf/Makefile | 8 
 2 files changed, 16 insertions(+)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index be0a961450bc..35444f4a846b 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -208,12 +208,20 @@ endif
 BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
 BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
+BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \
+			  $(CLANG) -target bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \
+			  readelf -S ./llvm_btf_verify.o | grep BTF; \
+			  /bin/rm -f ./llvm_btf_verify.o)
 
+ifneq ($(BTF_LLVM_PROBE),)
+   EXTRA_CFLAGS += -g
+else
 ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),)
EXTRA_CFLAGS += -g
LLC_FLAGS += -mattr=dwarfris
DWARF2BTF = y
 endif
+endif
 
 # Trick to allow make to be run from this directory
 all:
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 57b4712a6276..1dde03ea1484 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -126,7 +126,14 @@ $(OUTPUT)/test_stack_map.o: test_queue_stack_map.h
 BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
 BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
+BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \
+			  $(CLANG) -target bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \
+			  readelf -S ./llvm_btf_verify.o | grep BTF; \
+			  /bin/rm -f ./llvm_btf_verify.o)
 
+ifneq ($(BTF_LLVM_PROBE),)
+   CLANG_FLAGS += -g
+else
 ifneq ($(BTF_LLC_PROBE),)
 ifneq ($(BTF_PAHOLE_PROBE),)
 ifneq ($(BTF_OBJCOPY_PROBE),)
@@ -136,6 +143,7 @@ ifneq ($(BTF_OBJCOPY_PROBE),)
 endif
 endif
 endif
+endif
 
 # Have one program compiled without "-target bpf" to test whether libbpf loads
 # it successfully
-- 
2.17.1



[PATCH v5 bpf-next 13/13] tools/bpf: bpftool: add support for func types

2018-11-19 Thread Martin KaFai Lau
From: Yonghong Song 

This patch adds support to print the function signature
if BTF func_info is available.  Note that the ksym
now uses the function name instead of prog_name, as
prog_name has a limit of 16 bytes including
the ending '\0'.

The following is a sample output for selftests
test_btf with file test_btf_haskv.o for translated insns
and jited insns respectively.

  $ bpftool prog dump xlated id 1
  int _dummy_tracepoint(struct dummy_tracepoint_args * arg):
 0: (85) call pc+2#bpf_prog_2dcecc18072623fc_test_long_fname_1
 1: (b7) r0 = 0
 2: (95) exit
  int test_long_fname_1(struct dummy_tracepoint_args * arg):
 3: (85) call pc+1#bpf_prog_89d64e4abf0f0126_test_long_fname_2
 4: (95) exit
  int test_long_fname_2(struct dummy_tracepoint_args * arg):
 5: (b7) r2 = 0
 6: (63) *(u32 *)(r10 -4) = r2
 7: (79) r1 = *(u64 *)(r1 +8)
 ...
 22: (07) r1 += 1
 23: (63) *(u32 *)(r0 +4) = r1
 24: (95) exit

  $ bpftool prog dump jited id 1
  int _dummy_tracepoint(struct dummy_tracepoint_args * arg):
  bpf_prog_b07ccb89267cf242__dummy_tracepoint:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
    ..
    3c:	add    $0x28,%rbp
    40:	leaveq
    41:	retq

  int test_long_fname_1(struct dummy_tracepoint_args * arg):
  bpf_prog_2dcecc18072623fc_test_long_fname_1:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
    ..
    3a:	add    $0x28,%rbp
    3e:	leaveq
    3f:	retq

  int test_long_fname_2(struct dummy_tracepoint_args * arg):
  bpf_prog_89d64e4abf0f0126_test_long_fname_2:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
    ..
    80:	add    $0x28,%rbp
    84:	leaveq
    85:	retq

Signed-off-by: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
---
 tools/bpf/bpftool/btf_dumper.c| 136 ++
 tools/bpf/bpftool/main.h  |   2 +
 tools/bpf/bpftool/prog.c  |  56 
 tools/bpf/bpftool/xlated_dumper.c |  33 
 tools/bpf/bpftool/xlated_dumper.h |   3 +
 5 files changed, 230 insertions(+)

diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c
index 55bc512a1831..c3fd3a7cb787 100644
--- a/tools/bpf/bpftool/btf_dumper.c
+++ b/tools/bpf/bpftool/btf_dumper.c
@@ -249,3 +249,139 @@ int btf_dumper_type(const struct btf_dumper *d, __u32 type_id,
 {
return btf_dumper_do_type(d, type_id, 0, data);
 }
+
+#define BTF_PRINT_ARG(...) \
+   do {\
+   pos += snprintf(func_sig + pos, size - pos, \
+   __VA_ARGS__);   \
+   if (pos >= size)\
+   return -1;  \
+   } while (0)
+#define BTF_PRINT_TYPE(type)   \
+   do {\
+   pos = __btf_dumper_type_only(btf, type, func_sig,   \
+pos, size);\
+   if (pos == -1)  \
+   return -1;  \
+   } while (0)
+
+static int btf_dump_func(const struct btf *btf, char *func_sig,
+const struct btf_type *func_proto,
+const struct btf_type *func, int pos, int size);
+
+static int __btf_dumper_type_only(const struct btf *btf, __u32 type_id,
+ char *func_sig, int pos, int size)
+{
+   const struct btf_type *proto_type;
+   const struct btf_array *array;
+   const struct btf_type *t;
+
+   if (!type_id) {
+   BTF_PRINT_ARG("void ");
+   return pos;
+   }
+
+   t = btf__type_by_id(btf, type_id);
+
+   switch (BTF_INFO_KIND(t->info)) {
+   case BTF_KIND_INT:
+   BTF_PRINT_ARG("%s ", btf__name_by_offset(btf, t->name_off));
+   break;
+   case BTF_KIND_STRUCT:
+   BTF_PRINT_ARG("struct %s ",
+ btf__name_by_offset(btf, t->name_off));
+   break;
+   case BTF_KIND_UNION:
+   BTF_PRINT_ARG("union %s ",
+ btf__name_by_offset(btf, t->name_off));
+   break;
+   case BTF_KIND_ENUM:
+   BTF_PRINT_ARG("enum %s ",
+ btf__name_by_offset(btf, t->name_off));
+   break;
+   case BTF_KIND_ARRAY:
+   array = (struct btf_array *)(t + 1);
+   BTF_PRINT_TYPE(array->type);
+   BTF_PRINT_ARG("[%d]", array->nelems);
+   break;
+   case BTF_KIND_PTR:
+   BTF_PRINT_TYPE(t->type);
+   BTF_PRINT_ARG("* ");
+ 

[PATCH v5 bpf-next 09/13] tools/bpf: add support to read .BTF.ext sections

2018-11-19 Thread Martin KaFai Lau
From: Yonghong Song 

The .BTF section is already available to encode types.
These types can be used for map
pretty printing.  The whole .BTF section is passed to the
kernel as well, which the kernel can verify and return
to user space for pretty printing etc.
The llvm patch at https://reviews.llvm.org/D53736
will generate .BTF section and one more section .BTF.ext.
The .BTF.ext section encodes function type
information and line information. Note that
this patch set only supports function type info.
The functionality is implemented in libbpf.

The .BTF section can be directly loaded into the
kernel, and the .BTF.ext section cannot. The loader
may need to do some relocation and merging,
similar to merging multiple code sections, before
loading into the kernel.
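
The relocation mentioned above is, in essence, a unit conversion.  A
sketch, with names following bpf_func_info_min in the diff below:

	/* .BTF.ext stores insn offsets in bytes; the kernel expects
	 * them in units of struct bpf_insn (8 bytes each)
	 */
	for (i = 0; i < num_func_info; i++) {
		struct bpf_func_info_min *record;

		record = data + i * record_size;
		record->insn_offset /= sizeof(struct bpf_insn);
	}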

Signed-off-by: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/bpf.c|  46 ++-
 tools/lib/bpf/btf.c| 274 +
 tools/lib/bpf/btf.h|  50 
 tools/lib/bpf/libbpf.c |  87 ++---
 4 files changed, 442 insertions(+), 15 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 06fc5e91ac51..ce1822194590 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -177,6 +177,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
   char *log_buf, size_t log_buf_sz)
 {
union bpf_attr attr;
+   void *finfo = NULL;
__u32 name_len;
int fd;
 
@@ -207,12 +208,55 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
if (fd >= 0 || !log_buf || !log_buf_sz)
return fd;
 
+   /* After bpf_prog_load, the kernel may modify certain attributes
+* to give user space a hint how to deal with loading failure.
+* Check to see whether we can make some changes and load again.
+*/
+   if (errno == E2BIG && attr.func_info_cnt &&
+   attr.func_info_rec_size < load_attr->func_info_rec_size) {
+   __u32 actual_rec_size = load_attr->func_info_rec_size;
+   __u32 expected_rec_size = attr.func_info_rec_size;
+   __u32 finfo_cnt = load_attr->func_info_cnt;
+   __u64 finfo_len = actual_rec_size * finfo_cnt;
+   const void *orecord;
+   void *nrecord;
+   int i;
+
+   finfo = malloc(finfo_len);
+   if (!finfo)
+   /* further try with log buffer won't help */
+   return fd;
+
+   /* zero out bytes kernel does not understand */
+   orecord = load_attr->func_info;
+   nrecord = finfo;
+   for (i = 0; i < load_attr->func_info_cnt; i++) {
+   memcpy(nrecord, orecord, expected_rec_size);
+   memset(nrecord + expected_rec_size, 0,
+  actual_rec_size - expected_rec_size);
+   orecord += actual_rec_size;
+   nrecord += actual_rec_size;
+   }
+
+   /* try with corrected func info records */
+   attr.func_info = ptr_to_u64(finfo);
+   attr.func_info_rec_size = load_attr->func_info_rec_size;
+
+		fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+
+   if (fd >= 0 || !log_buf || !log_buf_sz)
+   goto done;
+   }
+
/* Try again with log */
attr.log_buf = ptr_to_u64(log_buf);
attr.log_size = log_buf_sz;
attr.log_level = 1;
log_buf[0] = 0;
-	return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+	fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+done:
+   free(finfo);
+   return fd;
 }
 
 int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 31225e64766f..fe87cb48a6a9 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -37,6 +37,18 @@ struct btf {
int fd;
 };
 
+struct btf_ext {
+   void *func_info;
+   __u32 func_info_rec_size;
+   __u32 func_info_len;
+};
+
+/* The minimum bpf_func_info checked by the loader */
+struct bpf_func_info_min {
+   __u32   insn_offset;
+   __u32   type_id;
+};
+
 static int btf_add_type(struct btf *btf, struct btf_type *t)
 {
if (btf->types_size - btf->nr_types < 2) {
@@ -397,3 +409,265 @@ const char *btf__name_by_offset(const struct btf *btf, __u32 offset)
else
return NULL;
 }
+
+static int btf_ext_validate_func_info(const void *finfo, __u32 size,
+ btf_print_fn_t err_log)
+{
+   int sec_hdrlen = sizeof(struct btf_sec_func_info);
+   __u32 size_left, num_records, record_size;
+   const struct btf_sec_func_info *sinfo;
+   __u64 total_record_size;
+
+   /* At least a func_info record size */
+   if (size < sizeo

[PATCH v5 bpf-next 11/13] tools/bpf: refactor to implement btf_get_from_id() in lib/bpf

2018-11-19 Thread Martin KaFai Lau
From: Yonghong Song 

The function get_btf() is implemented in tools/bpf/bpftool/map.c
to get a btf structure given a map_info.  This patch
refactors this function into btf_get_from_id()
in tools/lib/bpf so that it can be used later.
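
Usage then becomes a one-liner, as in the bpftool hunks below:

	struct btf *btf;
	int err;

	err = btf_get_from_id(info.btf_id, &btf);
	if (err)
		p_err("failed to get btf");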

Signed-off-by: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
---
 tools/bpf/bpftool/map.c | 68 ++--
 tools/lib/bpf/btf.c | 69 +
 tools/lib/bpf/btf.h |  1 +
 3 files changed, 72 insertions(+), 66 deletions(-)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index dc9a8967ab8c..a1ae2a3e9fef 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -215,70 +215,6 @@ static int do_dump_btf(const struct btf_dumper *d,
return ret;
 }
 
-static int get_btf(struct bpf_map_info *map_info, struct btf **btf)
-{
-   struct bpf_btf_info btf_info = { 0 };
-   __u32 len = sizeof(btf_info);
-   __u32 last_size;
-   int btf_fd;
-   void *ptr;
-   int err;
-
-   err = 0;
-   *btf = NULL;
-   btf_fd = bpf_btf_get_fd_by_id(map_info->btf_id);
-   if (btf_fd < 0)
-   return 0;
-
-   /* we won't know btf_size until we call bpf_obj_get_info_by_fd(). so
-* let's start with a sane default - 4KiB here - and resize it only if
-* bpf_obj_get_info_by_fd() needs a bigger buffer.
-*/
-   btf_info.btf_size = 4096;
-   last_size = btf_info.btf_size;
-   ptr = malloc(last_size);
-   if (!ptr) {
-   err = -ENOMEM;
-   goto exit_free;
-   }
-
-   bzero(ptr, last_size);
-   btf_info.btf = ptr_to_u64(ptr);
-	err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len);
-
-   if (!err && btf_info.btf_size > last_size) {
-   void *temp_ptr;
-
-   last_size = btf_info.btf_size;
-   temp_ptr = realloc(ptr, last_size);
-   if (!temp_ptr) {
-   err = -ENOMEM;
-   goto exit_free;
-   }
-   ptr = temp_ptr;
-   bzero(ptr, last_size);
-   btf_info.btf = ptr_to_u64(ptr);
-		err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len);
-   }
-
-   if (err || btf_info.btf_size > last_size) {
-   err = errno;
-   goto exit_free;
-   }
-
-   *btf = btf__new((__u8 *)btf_info.btf, btf_info.btf_size, NULL);
-   if (IS_ERR(*btf)) {
-   err = PTR_ERR(*btf);
-   *btf = NULL;
-   }
-
-exit_free:
-   close(btf_fd);
-   free(ptr);
-
-   return err;
-}
-
 static json_writer_t *get_btf_writer(void)
 {
json_writer_t *jw = jsonw_new(stdout);
@@ -775,7 +711,7 @@ static int do_dump(int argc, char **argv)
 
prev_key = NULL;
 
-   err = get_btf(&info, &btf);
+   err = btf_get_from_id(info.btf_id, &btf);
if (err) {
p_err("failed to get btf");
goto exit_free;
@@ -919,7 +855,7 @@ static int do_lookup(int argc, char **argv)
}
 
/* here means bpf_map_lookup_elem() succeeded */
-   err = get_btf(&info, &btf);
+   err = btf_get_from_id(info.btf_id, &btf);
if (err) {
p_err("failed to get btf");
goto exit_free;
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index fe87cb48a6a9..13ddc4bd24ee 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -49,6 +49,11 @@ struct bpf_func_info_min {
__u32   type_id;
 };
 
+static inline __u64 ptr_to_u64(const void *ptr)
+{
+   return (__u64) (unsigned long) ptr;
+}
+
 static int btf_add_type(struct btf *btf, struct btf_type *t)
 {
if (btf->types_size - btf->nr_types < 2) {
@@ -410,6 +415,70 @@ const char *btf__name_by_offset(const struct btf *btf, 
__u32 offset)
return NULL;
 }
 
+int btf_get_from_id(__u32 id, struct btf **btf)
+{
+   struct bpf_btf_info btf_info = { 0 };
+   __u32 len = sizeof(btf_info);
+   __u32 last_size;
+   int btf_fd;
+   void *ptr;
+   int err;
+
+   err = 0;
+   *btf = NULL;
+   btf_fd = bpf_btf_get_fd_by_id(id);
+   if (btf_fd < 0)
+   return 0;
+
+   /* we won't know btf_size until we call bpf_obj_get_info_by_fd(). so
+* let's start with a sane default - 4KiB here - and resize it only if
+* bpf_obj_get_info_by_fd() needs a bigger buffer.
+*/
+   btf_info.btf_size = 4096;
+   last_size = btf_info.btf_size;
+   ptr = malloc(last_size);
+   if (!ptr) {
+   err = -ENOMEM;
+   goto exit_free;
+   }
+
+   bzero(ptr, last_size);
+   btf_info.btf = ptr_to_u64(ptr);
+   err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len);
+
+   if (!err && btf_info.btf_size > last_size) {
+   void *temp_ptr;
+
+   last_size = btf_info.btf_size;
+   te

[PATCH v5 bpf-next 03/13] tools/bpf: Sync kernel btf.h header

2018-11-19 Thread Martin KaFai Lau
The kernel uapi btf.h is synced to the tools directory.

Signed-off-by: Martin KaFai Lau 
Signed-off-by: Yonghong Song 
---
 tools/include/uapi/linux/btf.h | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index 972265f32871..14f66948fc95 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -40,7 +40,8 @@ struct btf_type {
/* "size" is used by INT, ENUM, STRUCT and UNION.
 * "size" tells the size of the type it is describing.
 *
-* "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT.
+* "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
+* FUNC and FUNC_PROTO.
 * "type" is a type_id referring to another type.
 */
union {
@@ -64,8 +65,10 @@ struct btf_type {
 #define BTF_KIND_VOLATILE  9   /* Volatile */
 #define BTF_KIND_CONST 10  /* Const*/
 #define BTF_KIND_RESTRICT  11  /* Restrict */
-#define BTF_KIND_MAX   11
-#define NR_BTF_KINDS   12
+#define BTF_KIND_FUNC  12  /* Function */
+#define BTF_KIND_FUNC_PROTO13  /* Function Proto   */
+#define BTF_KIND_MAX   13
+#define NR_BTF_KINDS   14
 
 /* For some specific BTF_KIND, "struct btf_type" is immediately
  * followed by extra data.
@@ -110,4 +113,13 @@ struct btf_member {
__u32   offset; /* offset in bits */
 };
 
+/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param".
+ * The exact number of btf_param is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_param {
+   __u32   name_off;
+   __u32   type;
+};
+
 #endif /* _UAPI__LINUX_BTF_H__ */
-- 
2.17.1



[PATCH v5 bpf-next 00/13] bpf: Add btf func info support

2018-11-19 Thread Martin KaFai Lau
The BTF support was added to the kernel by commit 69b693f0aefa
("bpf: btf: Introduce BPF Type Format (BTF)"), which introduced
the .BTF section into the ELF file and is primarily
used for map pretty printing.
pahole is used to convert dwarf to BTF for ELF files.

This patch set adds func info support to the kernel so we can
get better ksyms for bpf function calls. Basically,
function call types are passed to the kernel and the kernel
extracts function names from these types in order to construct
ksyms for these functions.

The llvm patch at https://reviews.llvm.org/D53736
will generate .BTF section and one more section .BTF.ext.
The .BTF.ext section encodes function type
information. The following is a sample output for selftests
test_btf with file test_btf_haskv.o for translated insns
and jited insns respectively.

  $ bpftool prog dump xlated id 1
  int _dummy_tracepoint(struct dummy_tracepoint_args * arg):
 0: (85) call pc+2#bpf_prog_2dcecc18072623fc_test_long_fname_1
 1: (b7) r0 = 0
 2: (95) exit
  int test_long_fname_1(struct dummy_tracepoint_args * arg):
 3: (85) call pc+1#bpf_prog_89d64e4abf0f0126_test_long_fname_2
 4: (95) exit
  int test_long_fname_2(struct dummy_tracepoint_args * arg):
 5: (b7) r2 = 0
 6: (63) *(u32 *)(r10 -4) = r2
 7: (79) r1 = *(u64 *)(r1 +8)
 ...
 22: (07) r1 += 1
 23: (63) *(u32 *)(r0 +4) = r1
 24: (95) exit

  $ bpftool prog dump jited id 1
  int _dummy_tracepoint(struct dummy_tracepoint_args * arg):
  bpf_prog_b07ccb89267cf242__dummy_tracepoint:
 0:   push   %rbp
 1:   mov%rsp,%rbp
..
3c:   add$0x28,%rbp
40:   leaveq
41:   retq

  int test_long_fname_1(struct dummy_tracepoint_args * arg):
  bpf_prog_2dcecc18072623fc_test_long_fname_1:
 0:   push   %rbp
 1:   mov%rsp,%rbp
..
3a:   add$0x28,%rbp
3e:   leaveq
3f:   retq

  int test_long_fname_2(struct dummy_tracepoint_args * arg):
  bpf_prog_89d64e4abf0f0126_test_long_fname_2:
 0:   push   %rbp
 1:   mov%rsp,%rbp
..
80:   add$0x28,%rbp
84:   leaveq
85:   retq

Changelogs:
  v4 -> v5:
. Add back BTF_KIND_FUNC_PROTO as v1 did.  The difference
  is BTF_KIND_FUNC_PROTO cannot have t->name_off now.
  All param metadata is defined in BTF_KIND_FUNC_PROTO.
  BTF_KIND_FUNC must have t->name_off != 0 and t->type
  refers to a BTF_KIND_FUNC_PROTO.

  The above is the conclusion after the discussion between
  Edward Cree, Alexei, Daniel, Yonghong and Martin.
  v3 -> v4:
. Remove BTF_KIND_FUNC_PROTO. BTF_KIND_FUNC is used for
  both function pointer and subprogram. The name_off field
  is used to distinguish both.
. The record size is added to the func_info subsection
  in .BTF.ext to enable future extension.
. The bpf_prog_info interface change to make it similar
  bpf_prog_load.
. Related kernel and libbpf changes to accommodate the
  new .BTF.ext and kernel interface changes.
  v2 -> v3:
. Removed kernel btf extern functions btf_type_id_func()
  and btf_get_name_by_id(). Instead, exposing existing
  functions btf_type_by_id() and btf_name_by_offset().
. Added comments about ELF section .BTF.ext layout.
. Better codes in btftool as suggested by Edward Cree.
  v1 -> v2:
. Added missing sign-off.
. Limited the func_name/struct_member_name length for validity test.
. Removed/changed several verifier messages.
. Modified several commit messages to remove line_off reference.

Martin KaFai Lau (4):
  bpf: btf: Break up btf_type_is_void()
  bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
  tools/bpf: Sync kernel btf.h header
  tools/bpf: Add tests for BTF_KIND_FUNC_PROTO and BTF_KIND_FUNC

Yonghong Song (9):
  bpf: Introduce bpf_func_info
  tools/bpf: sync kernel uapi bpf.h header to tools directory
  tools/bpf: add new fields for program load in lib/bpf
  tools/bpf: extends test_btf to test load/retrieve func_type info
  tools/bpf: add support to read .BTF.ext sections
  tools/bpf: do not use pahole if clang/llvm can generate BTF sections
  tools/bpf: refactor to implement btf_get_from_id() in lib/bpf
  tools/bpf: enhance test_btf file testing to test func info
  tools/bpf: bpftool: add support for func types

 include/linux/bpf.h  |   5 +-
 include/linux/bpf_verifier.h |   1 +
 include/linux/btf.h  |   2 +
 include/uapi/linux/bpf.h |  13 +
 include/uapi/linux/btf.h |  18 +-
 kernel/bpf/btf.c | 420 +++--
 kernel/bpf/core.c|  13 +
 kernel/bpf/syscall.c |  59 +-
 kernel/bpf/verifier.c

[PATCH v5 bpf-next 04/13] tools/bpf: Add tests for BTF_KIND_FUNC_PROTO and BTF_KIND_FUNC

2018-11-19 Thread Martin KaFai Lau
This patch adds unit tests for BTF_KIND_FUNC_PROTO and
BTF_KIND_FUNC to test_btf.

Signed-off-by: Martin KaFai Lau 
Signed-off-by: Yonghong Song 
---
 tools/lib/bpf/btf.c|   4 +
 tools/testing/selftests/bpf/test_btf.c | 474 -
 2 files changed, 476 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 449591aa9900..31225e64766f 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -165,6 +165,10 @@ static int btf_parse_type_sec(struct btf *btf, 
btf_print_fn_t err_log)
case BTF_KIND_ENUM:
next_type += vlen * sizeof(struct btf_enum);
break;
+   case BTF_KIND_FUNC_PROTO:
+   next_type += vlen * sizeof(struct btf_param);
+   break;
+   case BTF_KIND_FUNC:
case BTF_KIND_TYPEDEF:
case BTF_KIND_PTR:
case BTF_KIND_FWD:
diff --git a/tools/testing/selftests/bpf/test_btf.c 
b/tools/testing/selftests/bpf/test_btf.c
index f42b3396d622..e05c8c04 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -85,8 +85,20 @@ static int __base_pr(const char *format, ...)
 #define BTF_TYPEDEF_ENC(name, type) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0), type)
 
-#define BTF_PTR_ENC(name, type) \
-   BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), type)
+#define BTF_PTR_ENC(type) \
+   BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), type)
+
+#define BTF_CONST_ENC(type) \
+   BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), type)
+
+#define BTF_FUNC_PROTO_ENC(ret_type, nargs) \
+   BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, nargs), ret_type)
+
+#define BTF_FUNC_PROTO_ARG_ENC(name, type) \
+   (name), (type)
+
+#define BTF_FUNC_ENC(name, func_proto) \
+   BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), func_proto)
 
 #define BTF_END_RAW 0xdeadbeef
 #define NAME_TBD 0xdeadb33f
@@ -1374,6 +1386,464 @@ static struct btf_raw_test raw_tests[] = {
.map_create_err = true,
 },
 
+{
+   .descr = "func proto (int (*)(int, unsigned int))",
+   .raw_types = {
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+   BTF_TYPE_INT_ENC(0, 0, 0, 32, 4),   /* [2] */
+   /* int (*)(int, unsigned int) */
+   BTF_FUNC_PROTO_ENC(1, 2),   /* [3] */
+   BTF_FUNC_PROTO_ARG_ENC(0, 1),
+   BTF_FUNC_PROTO_ARG_ENC(0, 2),
+   BTF_END_RAW,
+   },
+   .str_sec = "",
+   .str_sec_size = sizeof(""),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "func_proto_type_check_btf",
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .key_type_id = 1,
+   .value_type_id = 1,
+   .max_entries = 4,
+},
+
+{
+   .descr = "func proto (vararg)",
+   .raw_types = {
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+   BTF_TYPE_INT_ENC(0, 0, 0, 32, 4),   /* [2] */
+   /* void (*)(int, unsigned int, ...) */
+   BTF_FUNC_PROTO_ENC(0, 3),   /* [3] */
+   BTF_FUNC_PROTO_ARG_ENC(0, 1),
+   BTF_FUNC_PROTO_ARG_ENC(0, 2),
+   BTF_FUNC_PROTO_ARG_ENC(0, 0),
+   BTF_END_RAW,
+   },
+   .str_sec = "",
+   .str_sec_size = sizeof(""),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "func_proto_type_check_btf",
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .key_type_id = 1,
+   .value_type_id = 1,
+   .max_entries = 4,
+},
+
+{
+   .descr = "func proto (vararg with name)",
+   .raw_types = {
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+   BTF_TYPE_INT_ENC(0, 0, 0, 32, 4),   /* [2] */
+   /* void (*)(int a, unsigned int b, ... c) */
+   BTF_FUNC_PROTO_ENC(0, 3),   /* [3] */
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 0),
+   BTF_END_RAW,
+   },
+   .str_sec = "\0a\0b\0c",
+   .str_sec_size = sizeof("\0a\0b\0c"),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "func_proto_type_check_btf",
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .key_type_id = 1,
+   .value_type_id = 1,
+   .max_entries = 4,
+   .btf_load_err = true,
+   .err_str = "Invalid arg#3",
+},
+
+{
+   .descr = "func proto (arg after vararg)",
+   

[PATCH v5 bpf-next 12/13] tools/bpf: enhance test_btf file testing to test func info

2018-11-19 Thread Martin KaFai Lau
From: Yonghong Song 

Change the bpf programs test_btf_haskv.c and test_btf_nokv.c to
have two sections, and enhance the test_btf.c test_file feature
to test the btf func_info returned by the kernel.

Signed-off-by: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
---
 tools/testing/selftests/bpf/test_btf.c   | 117 +--
 tools/testing/selftests/bpf/test_btf_haskv.c |  16 ++-
 tools/testing/selftests/bpf/test_btf_nokv.c  |  16 ++-
 3 files changed, 136 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_btf.c 
b/tools/testing/selftests/bpf/test_btf.c
index 8fd3a16fea4d..7b1b160d6e67 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -2434,13 +2434,13 @@ static struct btf_file_test file_tests[] = {
 },
 };
 
-static int file_has_btf_elf(const char *fn)
+static int file_has_btf_elf(const char *fn, bool *has_btf_ext)
 {
Elf_Scn *scn = NULL;
GElf_Ehdr ehdr;
+   int ret = 0;
int elf_fd;
Elf *elf;
-   int ret;
 
if (CHECK(elf_version(EV_CURRENT) == EV_NONE,
  "elf_version(EV_CURRENT) == EV_NONE"))
@@ -2472,14 +2472,12 @@ static int file_has_btf_elf(const char *fn)
}
 
sh_name = elf_strptr(elf, ehdr.e_shstrndx, sh.sh_name);
-   if (!strcmp(sh_name, BTF_ELF_SEC)) {
+   if (!strcmp(sh_name, BTF_ELF_SEC))
ret = 1;
-   goto done;
-   }
+   if (!strcmp(sh_name, BTF_EXT_ELF_SEC))
+   *has_btf_ext = true;
}
 
-   ret = 0;
-
 done:
close(elf_fd);
elf_end(elf);
@@ -2489,15 +2487,24 @@ static int file_has_btf_elf(const char *fn)
 static int do_test_file(unsigned int test_num)
 {
const struct btf_file_test *test = &file_tests[test_num - 1];
+   const char *expected_fnames[] = {"_dummy_tracepoint",
+"test_long_fname_1",
+"test_long_fname_2"};
+   struct bpf_prog_info info = {};
struct bpf_object *obj = NULL;
+   struct bpf_func_info *finfo;
struct bpf_program *prog;
+   __u32 info_len, rec_size;
+   bool has_btf_ext = false;
+   struct btf *btf = NULL;
+   void *func_info = NULL;
struct bpf_map *map;
-   int err;
+   int i, err, prog_fd;
 
fprintf(stderr, "BTF libbpf test[%u] (%s): ", test_num,
test->file);
 
-   err = file_has_btf_elf(test->file);
+   err = file_has_btf_elf(test->file, &has_btf_ext);
if (err == -1)
return err;
 
@@ -2525,6 +2532,7 @@ static int do_test_file(unsigned int test_num)
err = bpf_object__load(obj);
if (CHECK(err < 0, "bpf_object__load: %d", err))
goto done;
+   prog_fd = bpf_program__fd(prog);
 
map = bpf_object__find_map_by_name(obj, "btf_map");
if (CHECK(!map, "btf_map not found")) {
@@ -2539,9 +2547,100 @@ static int do_test_file(unsigned int test_num)
  test->btf_kv_notfound))
goto done;
 
+   if (!jit_enabled || !has_btf_ext)
+   goto skip_jit;
+
+   /* get necessary program info */
+   info_len = sizeof(struct bpf_prog_info);
+   err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+
+   if (CHECK(err == -1, "invalid get info (1st) errno:%d", errno)) {
+   fprintf(stderr, "%s\n", btf_log_buf);
+   err = -1;
+   goto done;
+   }
+   if (CHECK(info.func_info_cnt != 3,
+ "incorrect info.func_info_cnt (1st) %d",
+ info.func_info_cnt)) {
+   err = -1;
+   goto done;
+   }
+   rec_size = info.func_info_rec_size;
+   if (CHECK(rec_size < 4,
+ "incorrect info.func_info_rec_size (1st) %d\n", rec_size)) {
+   err = -1;
+   goto done;
+   }
+
+   func_info = malloc(info.func_info_cnt * rec_size);
+   if (CHECK(!func_info, "out of memory")) {
+   err = -1;
+   goto done;
+   }
+
+   /* reset info to only retrieve func_info related data */
+   memset(&info, 0, sizeof(info));
+   info.func_info_cnt = 3;
+   info.func_info_rec_size = rec_size;
+   info.func_info = ptr_to_u64(func_info);
+
+   err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+
+   if (CHECK(err == -1, "invalid get info (2nd) errno:%d", errno)) {
+   fprintf(stderr, "%s\n", btf_log_buf);
+   err = -1;
+   goto done;
+   }
+   if (CHECK(info.func_info_cnt != 3,
+ "incorrect info.func_info_cnt (2nd) %d",
+ info.func_info_cnt)) {
+

[PATCH v5 bpf-next 02/13] bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO

2018-11-19 Thread Martin KaFai Lau
This patch adds BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
to support the function debug info.

BTF_KIND_FUNC_PROTO must not have a name (i.e. !t->name_off)
and it is followed by >= 0 'struct bpf_param' objects to
describe the function arguments.

The BTF_KIND_FUNC must have a valid name and it must
refer back to a BTF_KIND_FUNC_PROTO.

The above is the conclusion after the discussion between
Edward Cree, Alexei, Daniel, Yonghong and Martin.

By combining BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO,
a complete function signature can be obtained.  It will be
used in the later patches to learn the function signature of
a running bpf program.
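
As an illustration, using the encoding macros from test_btf.c in this
series (the type ids in the comments are illustrative), a declaration
like "int foo(int a, unsigned int b)" can be described by:

  __u32 raw_types[] = {
          BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] int */
          BTF_TYPE_INT_ENC(0, 0, 0, 32, 4),    /* [2] unsigned int */
          BTF_FUNC_PROTO_ENC(1, 2),            /* [3] int (*)(int, unsigned int) */
          BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1), /*     param "a" */
          BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2), /*     param "b" */
          BTF_FUNC_ENC(NAME_TBD, 3),           /* [4] FUNC "foo" -> [3] */
          BTF_END_RAW,
  };

The unnamed FUNC_PROTO carries the signature and the named FUNC refers
back to it, so both the pointer-to-function and the subprogram cases
are covered.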

Signed-off-by: Martin KaFai Lau 
Signed-off-by: Yonghong Song 
---
 include/uapi/linux/btf.h |  18 +-
 kernel/bpf/btf.c | 389 ++-
 2 files changed, 354 insertions(+), 53 deletions(-)

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 972265f32871..14f66948fc95 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -40,7 +40,8 @@ struct btf_type {
/* "size" is used by INT, ENUM, STRUCT and UNION.
 * "size" tells the size of the type it is describing.
 *
-* "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT.
+* "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
+* FUNC and FUNC_PROTO.
 * "type" is a type_id referring to another type.
 */
union {
@@ -64,8 +65,10 @@ struct btf_type {
 #define BTF_KIND_VOLATILE  9   /* Volatile */
 #define BTF_KIND_CONST 10  /* Const*/
 #define BTF_KIND_RESTRICT  11  /* Restrict */
-#define BTF_KIND_MAX   11
-#define NR_BTF_KINDS   12
+#define BTF_KIND_FUNC  12  /* Function */
+#define BTF_KIND_FUNC_PROTO13  /* Function Proto   */
+#define BTF_KIND_MAX   13
+#define NR_BTF_KINDS   14
 
 /* For some specific BTF_KIND, "struct btf_type" is immediately
  * followed by extra data.
@@ -110,4 +113,13 @@ struct btf_member {
__u32   offset; /* offset in bits */
 };
 
+/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param".
+ * The exact number of btf_param is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_param {
+   __u32   name_off;
+   __u32   type;
+};
+
 #endif /* _UAPI__LINUX_BTF_H__ */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2a50d87de485..6a2be79b73fc 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -259,6 +260,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_VOLATILE] = "VOLATILE",
[BTF_KIND_CONST]= "CONST",
[BTF_KIND_RESTRICT] = "RESTRICT",
+   [BTF_KIND_FUNC] = "FUNC",
+   [BTF_KIND_FUNC_PROTO]   = "FUNC_PROTO",
 };
 
 struct btf_kind_operations {
@@ -281,6 +284,9 @@ struct btf_kind_operations {
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
 static struct btf_type btf_void;
 
+static int btf_resolve(struct btf_verifier_env *env,
+  const struct btf_type *t, u32 type_id);
+
 static bool btf_type_is_modifier(const struct btf_type *t)
 {
/* Some of them is not strictly a C modifier
@@ -314,9 +320,20 @@ static bool btf_type_is_fwd(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
 }
 
+static bool btf_type_is_func(const struct btf_type *t)
+{
+   return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
+}
+
+static bool btf_type_is_func_proto(const struct btf_type *t)
+{
+   return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
+}
+
 static bool btf_type_nosize(const struct btf_type *t)
 {
-   return btf_type_is_void(t) || btf_type_is_fwd(t);
+   return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+  btf_type_is_func(t) || btf_type_is_func_proto(t);
 }
 
 static bool btf_type_nosize_or_null(const struct btf_type *t)
@@ -433,6 +450,30 @@ static bool btf_name_offset_valid(const struct btf *btf, 
u32 offset)
offset < btf->hdr.str_len;
 }
 
+/* Only C-style identifier is permitted. This can be relaxed if
+ * necessary.
+ */
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
+{
+   /* offset must be valid */
+   const char *src = &btf->strings[offset];
+   const char *src_limit;
+
+   if (!isalpha(*src) && *src != '_')
+   return false;
+
+   /* set a limit on identifier length */
+   src_limit = src + KSYM_NAME_LEN;
+   src++;
+   while (*src && src < src_limit) {
+   if (!isalnum(*src) && *src != '_')
+   return false;
+   

[PATCH v5 bpf-next 08/13] tools/bpf: extends test_btf to test load/retrieve func_type info

2018-11-19 Thread Martin KaFai Lau
From: Yonghong Song 

A two-function bpf program is loaded with btf and func_info.
After a successful prog load, the bpf_get_info syscall is called
to retrieve the prog info and to ensure the types returned from the
kernel match the types passed to the kernel from
user space.

Several negative tests are also added to test the loading/retrieving
of func_type info.

Signed-off-by: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
---
 tools/testing/selftests/bpf/test_btf.c | 332 -
 1 file changed, 329 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_btf.c 
b/tools/testing/selftests/bpf/test_btf.c
index e05c8c04..8fd3a16fea4d 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -22,9 +23,13 @@
 #include "bpf_rlimit.h"
 #include "bpf_util.h"
 
+#define MAX_INSNS  512
+#define MAX_SUBPROGS   16
+
 static uint32_t pass_cnt;
 static uint32_t error_cnt;
 static uint32_t skip_cnt;
+static bool jit_enabled;
 
 #define CHECK(condition, format...) ({ \
int __ret = !!(condition);  \
@@ -60,6 +65,24 @@ static int __base_pr(const char *format, ...)
return err;
 }
 
+static bool is_jit_enabled(void)
+{
+   const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable";
+   bool enabled = false;
+   int sysctl_fd;
+
+   sysctl_fd = open(jit_sysctl, 0, O_RDONLY);
+   if (sysctl_fd != -1) {
+   char tmpc;
+
+   if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1)
+   enabled = (tmpc != '0');
+   close(sysctl_fd);
+   }
+
+   return enabled;
+}
+
 #define BTF_INFO_ENC(kind, root, vlen) \
((!!(root) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
 
@@ -115,6 +138,7 @@ static struct args {
bool get_info_test;
bool pprint_test;
bool always_log;
+   bool func_type_test;
 } args;
 
 static char btf_log_buf[BTF_LOG_BUF_SIZE];
@@ -2947,16 +2971,310 @@ static int test_pprint(void)
return err;
 }
 
+static struct btf_func_type_test {
+   const char *descr;
+   const char *str_sec;
+   __u32 raw_types[MAX_NR_RAW_TYPES];
+   __u32 str_sec_size;
+   struct bpf_insn insns[MAX_INSNS];
+   __u32 prog_type;
+   __u32 func_info[MAX_SUBPROGS][2];
+   __u32 func_info_rec_size;
+   __u32 func_info_cnt;
+   bool expected_prog_load_failure;
+} func_type_test[] = {
+{
+   .descr = "func_type (main func + one sub)",
+   .raw_types = {
+   BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),   /* [1] 
*/
+   BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4),/* [2] */
+   BTF_FUNC_PROTO_ENC(1, 2),   /* [3] */
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+   BTF_FUNC_PROTO_ENC(1, 2),   /* [4] */
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+   BTF_FUNC_ENC(NAME_TBD, 3),  /* [5] */
+   BTF_FUNC_ENC(NAME_TBD, 4),  /* [6] */
+   BTF_END_RAW,
+   },
+   .str_sec = "\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB",
+   .str_sec_size = sizeof("\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB"),
+   .insns = {
+   BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+   BPF_MOV64_IMM(BPF_REG_0, 1),
+   BPF_EXIT_INSN(),
+   BPF_MOV64_IMM(BPF_REG_0, 2),
+   BPF_EXIT_INSN(),
+   },
+   .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+   .func_info = { {0, 5}, {3, 6} },
+   .func_info_rec_size = 8,
+   .func_info_cnt = 2,
+},
+
+{
+   .descr = "func_type (Incorrect func_info_rec_size)",
+   .raw_types = {
+   BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),   /* [1] 
*/
+   BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4),/* [2] */
+   BTF_FUNC_PROTO_ENC(1, 2),   /* [3] */
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+   BTF_FUNC_PROTO_ENC(1, 2),   /* [4] */
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+   BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+   BTF_FUNC_ENC(NAME_TBD, 3),  /* [5] */
+   BTF_FUNC_ENC(NAME_TBD, 4),  /* [6] */
+   BTF_END_RAW,
+   },
+   .str_sec = "\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB",
+   .str_sec_size = sizeof(&

[PATCH v5 bpf-next 05/13] bpf: Introduce bpf_func_info

2018-11-19 Thread Martin KaFai Lau
From: Yonghong Song 

This patch adds an interface to load a program with the following
additional information:
   . prog_btf_fd
   . func_info, func_info_rec_size and func_info_cnt
where func_info will provide function range and type_id
corresponding to each function.

The func_info_rec_size is introduced in the UAPI to specify
the struct bpf_func_info size passed from user space. This
is intended to make the bpf_func_info structure growable in the future.
If the kernel gets a different bpf_func_info size from userspace,
it will try to handle the user request with the part of bpf_func_info
it can understand. In this patch, the kernel can understand
  struct bpf_func_info {
   __u32   insn_offset;
   __u32   type_id;
  };
If the user passes a bpf func_info record size of 16 bytes, the
kernel can still handle the part of each record matching the above
definition.
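
A sketch of the resulting copy rule on the kernel side (the variable
names here are assumptions, not the literal verifier code):

  u32 min_size = min(krec_size, urec_size);

  for (i = 0; i < func_info_cnt; i++) {
          /* copy only the leading part both sides understand */
          if (copy_from_user(&krecord[i], urecord, min_size))
                  return -EFAULT;
          urecord += urec_size;
  }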

If the verifier agrees with the function ranges provided by the user,
the bpf_prog ksym for each function will use the func name
provided by the type_id. This is expected to give a better
encoding since it is not limited by the 16-byte program name
limitation, which especially helps bpf programs that contain
multiple subprograms.

The bpf_prog_info interface is also extended to
return btf_id, func_info, func_info_rec_size and func_info_cnt
to userspace, so userspace can print out the function prototype
for each xlated function. The insn_offset in the returned
func_info corresponds to the insn offset for xlated functions.
With other jit related fields in bpf_prog_info, userspace can also
print out function prototypes for each jited function.
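
A minimal retrieval sketch of the two-call pattern (error handling
omitted; it mirrors the selftest added later in this series):

  struct bpf_prog_info info = {};
  __u32 len = sizeof(info);
  __u32 cnt, rec_size;
  void *finfo;

  /* 1st call: learn func_info_cnt and func_info_rec_size */
  bpf_obj_get_info_by_fd(prog_fd, &info, &len);
  cnt = info.func_info_cnt;
  rec_size = info.func_info_rec_size;

  /* 2nd call: supply a buffer to receive the records */
  finfo = calloc(cnt, rec_size);
  memset(&info, 0, sizeof(info));
  info.func_info_cnt = cnt;
  info.func_info_rec_size = rec_size;
  info.func_info = ptr_to_u64(finfo);
  bpf_obj_get_info_by_fd(prog_fd, &info, &len);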

Signed-off-by: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
---
 include/linux/bpf.h  |   5 +-
 include/linux/bpf_verifier.h |   1 +
 include/linux/btf.h  |   2 +
 include/uapi/linux/bpf.h |  13 
 kernel/bpf/btf.c |   4 +-
 kernel/bpf/core.c|  13 
 kernel/bpf/syscall.c |  59 +++--
 kernel/bpf/verifier.c| 120 ++-
 8 files changed, 209 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 987815152629..7f0e225bf630 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -316,6 +316,8 @@ struct bpf_prog_aux {
void *security;
 #endif
struct bpf_prog_offload *offload;
+   struct btf *btf;
+   u32 type_id; /* type id for this prog/func */
union {
struct work_struct work;
struct rcu_head rcu;
@@ -527,7 +529,8 @@ static inline void bpf_long_memcpy(void *dst, const void 
*src, u32 size)
 }
 
 /* verify correctness of eBPF program */
-int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
+int bpf_check(struct bpf_prog **fp, union bpf_attr *attr,
+ union bpf_attr __user *uattr);
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
 
 /* Map specifics */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 11f5df1092d9..204382f46fd8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -204,6 +204,7 @@ static inline bool bpf_verifier_log_needed(const struct 
bpf_verifier_log *log)
 struct bpf_subprog_info {
u32 start; /* insn idx of function entry point */
u16 stack_depth; /* max. stack depth used by this function */
+   u32 type_id; /* btf type_id for this subprog */
 };
 
 /* single container for all structs
diff --git a/include/linux/btf.h b/include/linux/btf.h
index e076c4697049..7f2c0a4a45ea 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -46,5 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, 
void *obj,
   struct seq_file *m);
 int btf_get_fd_by_id(u32 id);
 u32 btf_id(const struct btf *btf);
+const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
+const char *btf_name_by_offset(const struct btf *btf, u32 offset);
 
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 47d606d744cc..e7a1930b6622 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -335,6 +335,10 @@ union bpf_attr {
 * (context accesses, allowed helpers, etc).
 */
__u32   expected_attach_type;
+   __u32   prog_btf_fd;/* fd pointing to BTF type data 
*/
+   __u32   func_info_rec_size; /* userspace 
bpf_func_info size */
+   __aligned_u64   func_info;  /* func info */
+   __u32   func_info_cnt;  /* number of bpf_func_info 
records */
};
 
struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -2635,6 +2639,10 @@ struct bpf_prog_info {
__u32 nr_jited_func_lens;
__aligned_u64 jited_ksyms;
__aligned_u64 jited_func_lens;
+   __u32 btf_id;
+   __u32 func_info_rec_size;
+   __aligned_u64 func_info;
+   __u32 func_info_cnt;
 } __attribute__((aligned(8)));
 
 struct

[PATCH v5 bpf-next 06/13] tools/bpf: sync kernel uapi bpf.h header to tools directory

2018-11-19 Thread Martin KaFai Lau
From: Yonghong Song 

The kernel uapi bpf.h is synced to tools directory.

Signed-off-by: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
---
 tools/include/uapi/linux/bpf.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 852dc17ab47a..28db552a1eed 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -335,6 +335,10 @@ union bpf_attr {
 * (context accesses, allowed helpers, etc).
 */
__u32   expected_attach_type;
+   __u32   prog_btf_fd;/* fd pointing to BTF type data 
*/
+   __u32   func_info_rec_size; /* userspace 
bpf_func_info size */
+   __aligned_u64   func_info;  /* func info */
+   __u32   func_info_cnt;  /* number of bpf_func_info 
records */
};
 
struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -2631,6 +2635,10 @@ struct bpf_prog_info {
__u32 nr_jited_func_lens;
__aligned_u64 jited_ksyms;
__aligned_u64 jited_func_lens;
+   __u32 btf_id;
+   __u32 func_info_rec_size;
+   __aligned_u64 func_info;
+   __u32 func_info_cnt;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -2942,4 +2950,9 @@ struct bpf_flow_keys {
};
 };
 
+struct bpf_func_info {
+   __u32   insn_offset;
+   __u32   type_id;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
2.17.1



[PATCH v5 bpf-next 07/13] tools/bpf: add new fields for program load in lib/bpf

2018-11-19 Thread Martin KaFai Lau
From: Yonghong Song 

The new fields are added for program load in lib/bpf so that an
application using the api bpf_load_program_xattr() is able
to load a program with btf and func_info data.

This functionality will be used in the next patch
by the bpf selftest test_btf.
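
A minimal usage sketch (insns, insns_cnt, btf_fd and the type_id are
assumed to be prepared by the caller; the type_id value is illustrative):

  struct bpf_func_info finfo[] = {
          { .insn_offset = 0, .type_id = 5 },
  };
  struct bpf_load_program_attr attr = {};
  char log_buf[4096];

  attr.prog_type = BPF_PROG_TYPE_TRACEPOINT;
  attr.insns = insns;
  attr.insns_cnt = insns_cnt;
  attr.license = "GPL";
  attr.prog_btf_fd = btf_fd;
  attr.func_info = finfo;
  attr.func_info_rec_size = sizeof(finfo[0]);
  attr.func_info_cnt = 1;

  prog_fd = bpf_load_program_xattr(&attr, log_buf, sizeof(log_buf));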

Signed-off-by: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/bpf.c | 4 
 tools/lib/bpf/bpf.h | 4 
 2 files changed, 8 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 03f9bcc4ef50..06fc5e91ac51 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -196,6 +196,10 @@ int bpf_load_program_xattr(const struct 
bpf_load_program_attr *load_attr,
attr.log_level = 0;
attr.kern_version = load_attr->kern_version;
attr.prog_ifindex = load_attr->prog_ifindex;
+   attr.prog_btf_fd = load_attr->prog_btf_fd;
+   attr.func_info_rec_size = load_attr->func_info_rec_size;
+   attr.func_info_cnt = load_attr->func_info_cnt;
+   attr.func_info = ptr_to_u64(load_attr->func_info);
memcpy(attr.prog_name, load_attr->name,
   min(name_len, BPF_OBJ_NAME_LEN - 1));
 
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 26a51538213c..8bdfd806253a 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -74,6 +74,10 @@ struct bpf_load_program_attr {
const char *license;
__u32 kern_version;
__u32 prog_ifindex;
+   __u32 prog_btf_fd;
+   __u32 func_info_rec_size;
+   const void *func_info;
+   __u32 func_info_cnt;
 };
 
 /* Flags to direct loading requirements */
-- 
2.17.1



[PATCH v5 bpf-next 01/13] bpf: btf: Break up btf_type_is_void()

2018-11-19 Thread Martin KaFai Lau
This patch breaks up btf_type_is_void() into
btf_type_is_void() and btf_type_is_fwd().

It also adds btf_type_nosize() to better describe that it is
testing whether a type has no size info.

Signed-off-by: Martin KaFai Lau 
---
 kernel/bpf/btf.c | 37 ++---
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ee4c82667d65..2a50d87de485 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -306,15 +306,22 @@ static bool btf_type_is_modifier(const struct btf_type *t)
 
 static bool btf_type_is_void(const struct btf_type *t)
 {
-   /* void => no type and size info.
-* Hence, FWD is also treated as void.
-*/
-   return t == _void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+   return t == _void;
+}
+
+static bool btf_type_is_fwd(const struct btf_type *t)
+{
+   return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_nosize(const struct btf_type *t)
+{
+   return btf_type_is_void(t) || btf_type_is_fwd(t);
 }
 
-static bool btf_type_is_void_or_null(const struct btf_type *t)
+static bool btf_type_nosize_or_null(const struct btf_type *t)
 {
-   return !t || btf_type_is_void(t);
+   return !t || btf_type_nosize(t);
 }
 
 /* union is only a special case of struct:
@@ -826,7 +833,7 @@ const struct btf_type *btf_type_id_size(const struct btf 
*btf,
u32 size = 0;
 
size_type = btf_type_by_id(btf, size_type_id);
-   if (btf_type_is_void_or_null(size_type))
+   if (btf_type_nosize_or_null(size_type))
return NULL;
 
if (btf_type_has_size(size_type)) {
@@ -842,7 +849,7 @@ const struct btf_type *btf_type_id_size(const struct btf 
*btf,
size = btf->resolved_sizes[size_type_id];
size_type_id = btf->resolved_ids[size_type_id];
size_type = btf_type_by_id(btf, size_type_id);
-   if (btf_type_is_void(size_type))
+   if (btf_type_nosize_or_null(size_type))
return NULL;
}
 
@@ -1164,7 +1171,7 @@ static int btf_modifier_resolve(struct btf_verifier_env 
*env,
}
 
/* "typedef void new_void", "const void"...etc */
-   if (btf_type_is_void(next_type))
+   if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type))
goto resolved;
 
if (!env_type_is_resolve_sink(env, next_type) &&
@@ -1178,7 +1185,7 @@ static int btf_modifier_resolve(struct btf_verifier_env 
*env,
 * pretty print).
 */
if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
-   !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+   !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
@@ -1205,7 +1212,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
}
 
/* "void *" */
-   if (btf_type_is_void(next_type))
+   if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type))
goto resolved;
 
if (!env_type_is_resolve_sink(env, next_type) &&
@@ -1235,7 +1242,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
}
 
if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
-   !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+   !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
@@ -1396,7 +1403,7 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->index_type */
index_type_id = array->index_type;
index_type = btf_type_by_id(btf, index_type_id);
-   if (btf_type_is_void_or_null(index_type)) {
+   if (btf_type_nosize_or_null(index_type)) {
btf_verifier_log_type(env, v->t, "Invalid index");
return -EINVAL;
}
@@ -1415,7 +1422,7 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->type */
elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
-   if (btf_type_is_void_or_null(elem_type)) {
+   if (btf_type_nosize_or_null(elem_type)) {
btf_verifier_log_type(env, v->t,
  "Invalid elem");
return -EINVAL;
@@ -1615,7 +1622,7 @@ static int btf_struct_resolve(struct btf_verifier_env 
*env,
const struct btf_type *member_type = btf_type_by_id(env->btf,
member_type_id);
 
-   if (btf_type_is_void_or_null(member_type)) {
+   if (btf_type_nosize_or_null(member_type)) {
btf_verifier_log_member(env, v->t, member,
"Invalid member");
return -EINVAL;
-- 
2.17.1



[PATCH bpf-next] bpf: libbpf: Fix bpf_program__next() API

2018-11-12 Thread Martin KaFai Lau
This patch restores the behavior in
commit eac7d84519a3 ("tools: libbpf: don't return '.text' as a program for 
multi-function programs")
such that bpf_program__next() does not return pseudo programs in ".text".
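
A minimal sketch of the resulting iteration behavior (error handling
omitted):

  int n = 0;
  struct bpf_program *prog = NULL;

  /* only real programs are returned; the ".text" pseudo
   * program holding subprog storage is skipped
   */
  while ((prog = bpf_program__next(prog, obj)) != NULL)
          n++;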

Fixes: 0c19a9fbc9cd ("libbpf: cleanup after partial failure in bpf_object__pin")
Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/libbpf.c | 25 +++--
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e827542ffa3a..a01eb9584e52 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2193,19 +2193,25 @@ void *bpf_object__priv(struct bpf_object *obj)
 }
 
 static struct bpf_program *
-__bpf_program__iter(struct bpf_program *p, struct bpf_object *obj, int i)
+__bpf_program__iter(struct bpf_program *p, struct bpf_object *obj, bool 
forward)
 {
+   size_t nr_programs = obj->nr_programs;
ssize_t idx;
 
-   if (!obj->programs)
+   if (!nr_programs)
return NULL;
 
+   if (!p)
+   /* Iter from the beginning */
+   return forward ? &obj->programs[0] :
+   &obj->programs[nr_programs - 1];
+
if (p->obj != obj) {
pr_warning("error: program handler doesn't match object\n");
return NULL;
}
 
-   idx = (p - obj->programs) + i;
+   idx = (p - obj->programs) + (forward ? 1 : -1);
if (idx >= obj->nr_programs || idx < 0)
return NULL;
return &obj->programs[idx];
@@ -2216,11 +,8 @@ bpf_program__next(struct bpf_program *prev, struct 
bpf_object *obj)
 {
struct bpf_program *prog = prev;
 
-   if (prev == NULL)
-   return obj->programs;
-
do {
-   prog = __bpf_program__iter(prog, obj, 1);
+   prog = __bpf_program__iter(prog, obj, true);
} while (prog && bpf_program__is_function_storage(prog, obj));
 
return prog;
@@ -2231,14 +2234,8 @@ bpf_program__prev(struct bpf_program *next, struct 
bpf_object *obj)
 {
struct bpf_program *prog = next;
 
-   if (next == NULL) {
-   if (!obj->nr_programs)
-   return NULL;
-   return obj->programs + obj->nr_programs - 1;
-   }
-
do {
-   prog = __bpf_program__iter(prog, obj, -1);
+   prog = __bpf_program__iter(prog, obj, false);
} while (prog && bpf_program__is_function_storage(prog, obj));
 
return prog;
-- 
2.17.1



[PATCH bpf] bpf: btf: Fix end boundary calculation for type section

2018-09-12 Thread Martin KaFai Lau
The end boundary math for the type section is incorrect in
btf_check_all_metas().  It just happens that hdr->type_off
is always 0 for now because there are only two sections
(type and string) and the string section must be at the end (ensured
in btf_parse_str_sec).

However, type_off may not be 0 if a new section is added later.
This patch fixes it.

Fixes: f80442a4cd18 ("bpf: btf: Change how section is supported in btf_header")
Reported-by: Dmitry Vyukov 
Signed-off-by: Martin KaFai Lau 
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2590700237c1..138f0302692e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env 
*env)
 
hdr = &btf->hdr;
cur = btf->nohdr_data + hdr->type_off;
-   end = btf->nohdr_data + hdr->type_len;
+   end = cur + hdr->type_len;
 
env->log_type_id = 1;
while (cur < end) {
-- 
2.17.1



Re: bpf: btf: Change how section is supported in btf_header

2018-09-11 Thread Martin KaFai Lau
On Tue, Sep 11, 2018 at 06:40:05PM +0200, Dmitry Vyukov wrote:
> Hi Martin,
> 
> I am looking at the subj commit:
> 
>  static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
> @@ -1754,9 +1756,9 @@ static int btf_check_all_metas(struct
> btf_verifier_env *env)
> struct btf_header *hdr;
> void *cur, *end;
> 
> -   hdr = btf->hdr;
> +   hdr = &btf->hdr;
> cur = btf->nohdr_data + hdr->type_off;
> -   end = btf->nohdr_data + hdr->str_off;
> +   end = btf->nohdr_data + hdr->type_len;
> 
> Shouldn't this be:
> 
> +   end = cur + hdr->type_len;
> 
> ? Or otherwise I am having trouble understanding meaning of fields.
You are correct.  Thanks for pointing this out.
Do you want to post an official patch for the bpf branch?

> 
> On a related note, what's between header and type_off? Is type_off
> supposed to be 0 always?
The type section is always the first section for now (i.e. immediately after
the header).  Other sections could be introduced later and could be
located before the type section, in which case type_off would not be 0.


Re: [PATCH bpf-next] bpf: enable btf for use in all maps

2018-08-10 Thread Martin KaFai Lau
On Fri, Aug 10, 2018 at 09:55:35AM +0200, Daniel Borkmann wrote:
> On 08/10/2018 04:13 AM, Alexei Starovoitov wrote:
> > On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote:
> >> On 08/09/2018 11:44 PM, Alexei Starovoitov wrote:
> >>> On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote:
>  On 08/09/2018 11:14 PM, Alexei Starovoitov wrote:
> > On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
> >> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
> >> the basic arraymap") enabled support for BTF and dumping via
> >> BPF fs for arraymap. However, both can be decoupled from each
> >> other such that all BPF maps can be supported for attaching
> >> BTF key/value information, while not all maps necessarily
> >> need to dump via map_seq_show_elem() callback.
> >>
> >> The check in array_map_check_btf() can be generalized as
> >> ultimatively the key and value size is the only contraint
> >> that needs to match for the map. The fact that the key needs
> >> to be of type int is optional; it could be any data type as
> >> long as it matches the 4 byte key size, just like hash table
> >> key or others could be of any data type as well.
> >>
> >> Minimal example of a hash table dump which then works out
> >> of the box for bpftool:
> >>
> >>   # bpftool map dump id 19
> >>   [{
> >>   "key": {
> >>   "": {
> >>   "vip": 0,
> >>   "vipv6": []
> >>   },
> >>   "port": 0,
> >>   "family": 0,
> >>   "proto": 0
> >>   },
> >>   "value": {
> >>   "flags": 0,
> >>   "vip_num": 0
> >>   }
> >>   }
> >>   ]
> >>
> >> Signed-off-by: Daniel Borkmann 
> >> Cc: Yonghong Song 
> >> ---
> >>  include/linux/bpf.h   |  4 +---
> >>  kernel/bpf/arraymap.c | 27 ---
> >>  kernel/bpf/inode.c|  3 ++-
> >>  kernel/bpf/syscall.c  | 24 
> >>  4 files changed, 23 insertions(+), 35 deletions(-)
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index cd8790d..eb76e8e 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -48,8 +48,6 @@ struct bpf_map_ops {
> >>u32 (*map_fd_sys_lookup_elem)(void *ptr);
> >>void (*map_seq_show_elem)(struct bpf_map *map, void *key,
> >>  struct seq_file *m);
> >> -  int (*map_check_btf)(const struct bpf_map *map, const struct 
> >> btf *btf,
> >> -   u32 key_type_id, u32 value_type_id);
> >>  };
> >>  
> >>  struct bpf_map {
> >> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const 
> >> struct bpf_map *map)
> >>  
> >>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
> >>  {
> >> -  return map->ops->map_seq_show_elem && map->ops->map_check_btf;
> >> +  return map->btf && map->ops->map_seq_show_elem;
> >>  }
> >>  
> >>  extern const struct bpf_map_ops bpf_map_offload_ops;
> >> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> >> index 2aa55d030..67f0bdf 100644
> >> --- a/kernel/bpf/arraymap.c
> >> +++ b/kernel/bpf/arraymap.c
> >> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct 
> >> bpf_map *map, void *key,
> >>rcu_read_unlock();
> >>  }
> >>  
> >> -static int array_map_check_btf(const struct bpf_map *map, const 
> >> struct btf *btf,
> >> - u32 btf_key_id, u32 btf_value_id)
> >> -{
> >> -  const struct btf_type *key_type, *value_type;
> >> -  u32 key_size, value_size;
> >> -  u32 int_data;
> >> -
> >> -  key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
> >> -  if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> >> -  return -EINVAL;
> >> -
> >> -  int_data = *(u32 *)(key_type + 1);
> >> -  /* bpf array can only take a u32 key.  This check makes
> >> -   * sure that the btf matches the attr used during map_create.
> >> -   */
> >> -  if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
> >> -  BTF_INT_OFFSET(int_data))
> >> -  return -EINVAL;
> >
> > I think most of these checks are still necessary for array type.
> > Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
> > is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't 
> > makes sense.
> 
>  Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array,
>  on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY
>  could be 

[PATCH bpf-next 9/9] bpf: Test BPF_PROG_TYPE_SK_REUSEPORT

2018-08-08 Thread Martin KaFai Lau
This patch adds tests for the new BPF_PROG_TYPE_SK_REUSEPORT.

The tests cover:
- IPv4/IPv6 + TCP/UDP
- TCP syncookie
- TCP fastopen
- Cases when the bpf_sk_select_reuseport() returning errors
- Cases when the bpf prog returns SK_DROP
- Values from sk_reuseport_md
- outer_map => reuseport_array

The test depends on
commit 3eee1f75f2b9 ("bpf: fix bpf_skb_load_bytes_relative pkt length check")

Signed-off-by: Martin KaFai Lau 
Acked-by: Alexei Starovoitov 
---
 tools/lib/bpf/bpf.c   |   1 +
 tools/lib/bpf/bpf.h   |   1 +
 tools/testing/selftests/bpf/Makefile  |   4 +-
 tools/testing/selftests/bpf/bpf_helpers.h |   4 +
 .../selftests/bpf/test_select_reuseport.c | 688 ++
 .../bpf/test_select_reuseport_common.h|  36 +
 .../bpf/test_select_reuseport_kern.c  | 180 +
 7 files changed, 912 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_select_reuseport.c
 create mode 100644 tools/testing/selftests/bpf/test_select_reuseport_common.h
 create mode 100644 tools/testing/selftests/bpf/test_select_reuseport_kern.c

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 9ddc89dae962..60aa4ca8b2c5 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -92,6 +92,7 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr 
*create_attr)
attr.btf_key_type_id = create_attr->btf_key_type_id;
attr.btf_value_type_id = create_attr->btf_value_type_id;
attr.map_ifindex = create_attr->map_ifindex;
+   attr.inner_map_fd = create_attr->inner_map_fd;
 
return sys_bpf(BPF_MAP_CREATE, , sizeof(attr));
 }
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 0639a30a457d..6f38164b2618 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -39,6 +39,7 @@ struct bpf_create_map_attr {
__u32 btf_key_type_id;
__u32 btf_value_type_id;
__u32 map_ifindex;
+   __u32 inner_map_fd;
 };
 
 int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index ad241ddba350..6ffd6c5988fb 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -23,7 +23,7 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs \
test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user 
\
-   test_socket_cookie test_cgroup_storage
+   test_socket_cookie test_cgroup_storage test_select_reuseport
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o 
test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o 
sockmap_parse_prog.o \
@@ -34,7 +34,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o 
test_tcp_estats.o test
test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o 
\
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o 
test_lirc_mode2_kern.o \
-   get_cgroup_id_kern.o socket_cookie_prog.o
+   get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h 
b/tools/testing/selftests/bpf/bpf_helpers.h
index cb9fcfbc9307..64eec58854b0 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -111,6 +111,8 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
 static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 int size, int flags) =
(void *) BPF_FUNC_skb_get_xfrm_state;
+static int (*bpf_sk_select_reuseport)(void *ctx, void *map, void *key, __u32 
flags) =
+   (void *) BPF_FUNC_sk_select_reuseport;
 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
(void *) BPF_FUNC_get_stack;
 static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params,
@@ -164,6 +166,8 @@ struct bpf_map_def {
 
 static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) =
(void *) BPF_FUNC_skb_load_bytes;
+static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int 
len, __u32 start_header) =
+   (void *) BPF_FUNC_skb_load_bytes_relative;
 static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int 
flags) =
(void *) BPF_FUNC_skb_store_bytes;
 static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int 
flags) =
diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c 
b/tools/testing/selftests/bpf/test_select_reuseport.c
new file mode 100644
index ..75646d9b34aa

[PATCH bpf-next 4/9] bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT

2018-08-08 Thread Martin KaFai Lau
This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select
a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY.  Like other
non SK_FILTER/CGROUP_SKB program, it requires CAP_SYS_ADMIN.

BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern"
to store the bpf context instead of using the skb->cb[48].

At SO_REUSEPORT sk lookup time, we are in the middle of transiting
from a lower layer (ipv4/ipv6) to an upper layer (udp/tcp).  At this
point, it is not always clear where the bpf context can be appended
in the skb->cb[48] to avoid saving-and-restoring cb[], even putting
aside the differences between ipv4-vs-ipv6 and udp-vs-tcp.  It is
also not clear whether the lower layer will only ever be ipv4 and
ipv6, and whether it will leave the cb[] untouched before transiting
to the upper layer.

For example, in udp_gro_receive(), it uses the 48-byte NAPI_GRO_CB
instead of IP[6]CB and it may still modify the cb[] after calling
the udp[46]_lib_lookup_skb().  Because of the above reason, if
skb->cb is used for the bpf ctx, saving-and-restoring is needed
and likely the whole 48-byte cb[] has to be saved and restored.

Instead of saving, setting and restoring the cb[], this patch opts
to create a new "struct sk_reuseport_kern" and setting the needed
values in there.

The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)"
will serve all ipv4/ipv6 + udp/tcp combinations.  There is no protocol
specific usage at this point and it is also inline with the current
sock_reuseport.c implementation (i.e. no protocol specific requirement).

In "struct sk_reuseport_md", this patch exposes data/data_end/len
with semantic similar to other existing usages.  Together
with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()",
the bpf prog can peek anywhere in the skb.  The "bind_inany" tells
the bpf prog that the reuseport group is bind-ed to a local
INANY address which cannot be learned from skb.

The new "bind_inany" is added to "struct sock_reuseport" which will be
used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order
to avoid repeating the "bind INANY" test on
"sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run.  It can
only be properly initialized when a "sk->sk_reuseport" enabled sk is
adding to a hashtable (i.e. during "reuseport_alloc()" and
"reuseport_add_sock()").

The new "sk_select_reuseport()" is the main helper that the
bpf prog will use to select a SO_REUSEPORT sk.  It is the only function
that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY.  As mentioned in
the earlier patch, the validity of a selected sk is checked in
run time in "sk_select_reuseport()".  Doing the check in
verification time is difficult and inflexible (consider the map-in-map
use case).  The runtime check is to compare the selected sk's reuseport_id
with the reuseport_id that we want.  This helper will return -EXXX if the
selected sk cannot serve the incoming request (e.g. reuseport_id
not match).  The bpf prog can decide if it wants to do SK_DROP as its
discretion.

When the bpf prog returns SK_PASS, the kernel will check if a
valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL").
If it does, it will use the selected sk.  If not, the kernel
will select one from "reuse->socks[]" (as before this patch).

The SK_DROP and SK_PASS handling logic will be in the next patch.
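
A minimal bpf prog sketch (the map setup and key derivation are
assumptions; complete programs are in the selftests added later in
this series):

  struct bpf_map_def SEC("maps") reuseport_array = {
          .type = BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
          .key_size = sizeof(__u32),
          .value_size = sizeof(__u32),
          .max_entries = 1,
  };

  SEC("sk_reuseport")
  int _select_sk(struct sk_reuseport_md *reuse_md)
  {
          __u32 index = 0; /* e.g. derived from reuse_md->data */

          /* on success the kernel records the selected sk and
           * SK_PASS uses it; on error the prog can opt for SK_DROP
           */
          if (bpf_sk_select_reuseport(reuse_md, &reuseport_array,
                                      &index, 0))
                  return SK_DROP;
          return SK_PASS;
  }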

Signed-off-by: Martin KaFai Lau 
Acked-by: Alexei Starovoitov 
---
 include/linux/bpf_types.h   |   3 +
 include/linux/filter.h  |  15 ++
 include/net/addrconf.h  |   1 +
 include/net/sock_reuseport.h|   6 +-
 include/uapi/linux/bpf.h|  36 -
 kernel/bpf/verifier.c   |   9 ++
 net/core/filter.c   | 269 +++-
 net/core/sock_reuseport.c   |  20 ++-
 net/ipv4/inet_connection_sock.c |   9 ++
 net/ipv4/inet_hashtables.c  |   5 +-
 net/ipv4/udp.c  |   5 +-
 11 files changed, 365 insertions(+), 13 deletions(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 14fd6c02d258..cd26c090e7c0 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #endif
+#ifdef CONFIG_INET
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c73dd7396886..29577c6f3289 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -32,6 +32,7 @@ struct seccomp_data;
 struct bpf_prog_aux;
 struct xdp_rxq_info;
 struct xdp_buff;
+struct sock_reuseport;
 
 

[PATCH bpf-next 2/9] net: Add ID (if needed) to sock_reuseport and expose reuseport_lock

2018-08-08 Thread Martin KaFai Lau
A later patch will introduce a BPF_MAP_TYPE_REUSEPORT_ARRAY which
allows a SO_REUSEPORT sk to be added to a bpf map.  When a sk
is removed from reuse->socks[], it also needs to be removed from
the bpf map.  Also, when adding a sk to a bpf map, the bpf
map needs to ensure it is indeed in a reuse->socks[].
Hence, reuseport_lock is needed by the bpf map to ensure its
map_update_elem() and map_delete_elem() operations are in-sync with
the reuse->socks[].  The BPF_MAP_TYPE_REUSEPORT_ARRAY map will only
acquire the reuseport_lock after ensuring the adding sk is already
in a reuseport group (i.e. reuse->socks[]).  The map_lookup_elem()
will be lockless.

This patch also adds an ID to sock_reuseport.  A later patch
will introduce BPF_PROG_TYPE_SK_REUSEPORT which allows
a bpf prog to select a sk from a bpf map.  It is inflexible to
statically enforce that a bpf map can only contain the sks belonging to
a particular reuse->socks[] (i.e. same IP:PORT) at bpf
verification time. For example, think about the map-in-map situation
where the inner map can be dynamically changed at runtime and the outer
map may have inner maps belonging to different reuseport groups.
Hence, when the bpf prog (in the new BPF_PROG_TYPE_SK_REUSEPORT
type) selects a sk,  this selected sk has to be checked to ensure it
belongs to the requesting reuseport group (i.e. the group serving
that IP:PORT).

The "sk->sk_reuseport_cb" pointer cannot be used for this checking
purpose because the pointer value will change after reuseport_grow().
Instead of saving all the checking conditions like the ones
preceding the "reuseport_add_sock()" call and comparing them every time a
bpf_prog is run, a 32-bit ID is introduced to survive the
reuseport_grow().  The ID is only acquired if any of the
reuse->socks[] is added to the newly introduced
"BPF_MAP_TYPE_REUSEPORT_ARRAY" map.

If "BPF_MAP_TYPE_REUSEPORT_ARRAY" is not used,  the changes in this
patch is a no-op.

Signed-off-by: Martin KaFai Lau 
Acked-by: Alexei Starovoitov 
---
 include/net/sock_reuseport.h |  6 ++
 net/core/sock_reuseport.c| 27 ++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 6bef7a0052f2..e1a7681856f7 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -5,8 +5,11 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
+extern spinlock_t reuseport_lock;
+
 struct sock_reuseport {
struct rcu_head rcu;
 
@@ -16,6 +19,8 @@ struct sock_reuseport {
 * reuse->socks[] group.
 */
unsigned intsynq_overflow_ts;
+   /* ID stays the same even after the size of socks[] grows. */
+   unsigned intreuseport_id;
struct bpf_prog __rcu   *prog;  /* optional BPF sock selector */
struct sock *socks[0];  /* array of sock pointers */
 };
@@ -29,5 +34,6 @@ extern struct sock *reuseport_select_sock(struct sock *sk,
  int hdr_len);
 extern struct bpf_prog *reuseport_attach_prog(struct sock *sk,
  struct bpf_prog *prog);
+int reuseport_get_id(struct sock_reuseport *reuse);
 
 #endif  /* _SOCK_REUSEPORT_H */
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 3f188fad0162..cf2e4d305af9 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -8,11 +8,33 @@
 
 #include 
 #include 
+#include 
 #include 
 
 #define INIT_SOCKS 128
 
-static DEFINE_SPINLOCK(reuseport_lock);
+DEFINE_SPINLOCK(reuseport_lock);
+
+#define REUSEPORT_MIN_ID 1
+static DEFINE_IDA(reuseport_ida);
+
+int reuseport_get_id(struct sock_reuseport *reuse)
+{
+   int id;
+
+   if (reuse->reuseport_id)
+   return reuse->reuseport_id;
+
+   id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
+   /* Called under reuseport_lock */
+   GFP_ATOMIC);
+   if (id < 0)
+   return id;
+
+   reuse->reuseport_id = id;
+
+   return reuse->reuseport_id;
+}
 
 static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
 {
@@ -78,6 +100,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
more_reuse->prog = reuse->prog;
+   more_reuse->reuseport_id = reuse->reuseport_id;
 
memcpy(more_reuse->socks, reuse->socks,
   reuse->num_socks * sizeof(struct sock *));
@@ -102,6 +125,8 @@ static void reuseport_free_rcu(struct rcu_head *head)
reuse = container_of(head, struct sock_reuseport, rcu);
if (reuse->prog)
bpf_prog_destroy(reuse->prog);
+   if (reuse->reuseport_id)
+   ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
kfree(reuse);
 }
 
-- 
2.17.1



[PATCH bpf-next 8/9] bpf: test BPF_MAP_TYPE_REUSEPORT_SOCKARRAY

2018-08-08 Thread Martin KaFai Lau
This patch adds tests for the new BPF_MAP_TYPE_REUSEPORT_SOCKARRAY.

Signed-off-by: Martin KaFai Lau 
Acked-by: Alexei Starovoitov 
---
 tools/lib/bpf/libbpf.c  |   1 +
 tools/testing/selftests/bpf/test_maps.c | 262 +++-
 2 files changed, 262 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 79fc7ed6995a..07d961e6ecab 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1496,6 +1496,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type 
type)
case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_LIRC_MODE2:
+   case BPF_PROG_TYPE_SK_REUSEPORT:
return false;
case BPF_PROG_TYPE_UNSPEC:
case BPF_PROG_TYPE_KPROBE:
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 6c253343a6f9..4b7c74f5faa7 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -17,7 +17,8 @@
 #include 
 
 #include 
-
+#include 
+#include 
 #include 
 
 #include 
@@ -26,8 +27,21 @@
 #include "bpf_util.h"
 #include "bpf_rlimit.h"
 
+#ifndef ENOTSUPP
+#define ENOTSUPP 524
+#endif
+
 static int map_flags;
 
+#define CHECK(condition, tag, format...) ({\
+   int __ret = !!(condition);  \
+   if (__ret) {\
+   printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \
+   printf(format); \
+   exit(-1);   \
+   }   \
+})
+
 static void test_hashmap(int task, void *data)
 {
long long key, next_key, first_key, value;
@@ -1150,6 +1164,250 @@ static void test_map_wronly(void)
assert(bpf_map_get_next_key(fd, , ) == -1 && errno == EPERM);
 }
 
+static void prepare_reuseport_grp(int type, int map_fd,
+ __s64 *fds64, __u64 *sk_cookies,
+ unsigned int n)
+{
+   socklen_t optlen, addrlen;
+   struct sockaddr_in6 s6;
+   const __u32 index0 = 0;
+   const int optval = 1;
+   unsigned int i;
+   u64 sk_cookie;
+   __s64 fd64;
+   int err;
+
+   s6.sin6_family = AF_INET6;
+   s6.sin6_addr = in6addr_any;
+   s6.sin6_port = 0;
+   addrlen = sizeof(s6);
+   optlen = sizeof(sk_cookie);
+
+   for (i = 0; i < n; i++) {
+   fd64 = socket(AF_INET6, type, 0);
+   CHECK(fd64 == -1, "socket()",
+ "sock_type:%d fd64:%lld errno:%d\n",
+ type, fd64, errno);
+
+   err = setsockopt(fd64, SOL_SOCKET, SO_REUSEPORT,
+&optval, sizeof(optval));
+   CHECK(err == -1, "setsockopt(SO_REUSEPORT)",
+ "err:%d errno:%d\n", err, errno);
+
+   /* reuseport_array does not allow unbound sk */
+   err = bpf_map_update_elem(map_fd, &index0, &fd64,
+ BPF_ANY);
+   CHECK(err != -1 || errno != EINVAL,
+ "reuseport array update unbound sk",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+
+   err = bind(fd64, (struct sockaddr *)&s6, sizeof(s6));
+   CHECK(err == -1, "bind()",
+ "sock_type:%d err:%d errno:%d\n", type, err, errno);
+
+   if (i == 0) {
+   err = getsockname(fd64, (struct sockaddr *)&s6,
+ &addrlen);
+   CHECK(err == -1, "getsockname()",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+   }
+
+   err = getsockopt(fd64, SOL_SOCKET, SO_COOKIE, &sk_cookie,
+&optlen);
+   CHECK(err == -1, "getsockopt(SO_COOKIE)",
+ "sock_type:%d err:%d errno:%d\n", type, err, errno);
+
+   if (type == SOCK_STREAM) {
+   /*
+* reuseport_array does not allow
+* non-listening tcp sk.
+*/
+   err = bpf_map_update_elem(map_fd, &index0, &fd64,
+ BPF_ANY);
+   CHECK(err != -1 || errno != EINVAL,
+ "reuseport array update non-listening sk",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+   err = 

[PATCH bpf-next 3/9] bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY

2018-08-08 Thread Martin KaFai Lau
This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY.

To unleash the full potential of a bpf prog, it is essential for the
userspace to be capable of directly setting up a bpf map which can then
be consumed by the bpf prog to make decisions.  In this case, deciding which
SO_REUSEPORT sk should serve the incoming request.

By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control
and visibility on where a SO_REUSEPORT sk should be located in a bpf map.
The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that
the bpf prog can directly select a sk from the bpf map.  That will
raise the programmability of the bpf prog attached to a reuseport
group (a group of sk serving the same IP:PORT).

For example, in UDP, the bpf prog can peek into the payload (e.g.
through the "data" pointer introduced in the later patch) to learn
the application level's connection information and then decide which sk
to pick from a bpf map.  The userspace can tightly couple the sk's location
in a bpf map with the application logic in generating the UDP payload's
connection information.  This connection info contract/API stays within the
userspace.

Also, when used with map-in-map, the userspace can switch the
old-server-process's inner map to a new-server-process's inner map
in one call "bpf_map_update_elem(outer_map, , _reuseport_array)".
The bpf prog will then direct incoming requests to the new process instead
of the old process.  The old process can finish draining the pending
requests (e.g. by "accept()") before closing the old-fds.  [Note that
deleting a fd from a bpf map does not necessarily mean the fd is closed.]
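
For illustration, the userspace side of that switch could be as small
as the sketch below (the fds and the index are hypothetical;
bpf_map_update_elem() is the syscall wrapper from tools/lib/bpf):

static int switch_to_new_server(int outer_map_fd, int new_reuseport_array_fd)
{
	__u32 index = 0;

	/* One syscall: every later lookup by the bpf prog resolves to
	 * the new process's inner reuseport array.
	 */
	return bpf_map_update_elem(outer_map_fd, &index,
				   &new_reuseport_array_fd, BPF_ANY);
}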

During map_update_elem(), only a SO_REUSEPORT sk (i.e. one which has
already been added to a reuse->socks[]) can be used.  That means a
SO_REUSEPORT sk that has been "bind()"-ed for UDP or
"bind()+listen()"-ed for TCP.  These conditions are
ensured in "reuseport_array_update_check()".

A SO_REUSEPORT sk can only be added once to a map (i.e. the
same sk cannot be added twice even to the same map).  SO_REUSEPORT
already allows another sk to be created for the same IP:PORT.
There is no need to re-create a similar usage in the BPF side.

When a SO_REUSEPORT is deleted from the "reuse->socks[]" (e.g. "close()"),
it will notify the bpf map to remove it from the map also.  It is
done through "bpf_sk_reuseport_detach()" and it will only be called
if >=1 of the "reuse->sock[]" has ever been added to a bpf map.

The map_update()/map_delete() has to be in-sync with the
"reuse->socks[]".  Hence, the same "reuseport_lock" used
by "reuse->socks[]" has to be used here also. Care has
been taken to ensure the lock is only acquired when the
adding sk passes some strict tests. and
freeing the map does not require the reuseport_lock.

The reuseport_array will also support lookup from the syscall
side.  It will return a sock_gen_cookie().  The sock_gen_cookie()
is on-demand (i.e. a sk's cookie is not generated until the very
first map_lookup_elem()).

The lookup cookie is 64 bits, but that goes against the logical userspace
expectation of a 32-bit sizeof(fd) (which other fd-based bpf maps also
follow).  It may catch the user by surprise if we enforce value_size=8
while userspace still passes a 32-bit fd during update.  Supporting
different value_size between lookup and update also seems unintuitive.

We also need to consider what happens if other existing fd based maps want
to return a 64-bit value from the syscall's lookup in the future.
Hence, reuseport_array supports both value_size 4 and 8, assuming
the user will usually use value_size=4.  The syscall's lookup
will return ENOSPC on value_size=4.  It will only
return the 64-bit value from sock_gen_cookie() when the user consciously
chooses value_size=8 (as a signal that lookup is desired), which then
requires a 64-bit value in both lookup and update.
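
For illustration, the value_size=8 flow described above would look
roughly like this from userspace (syscall wrapper names from
tools/lib/bpf; a map created with value_size=8 is assumed):

static int read_sk_cookie(int map_fd, int sk_fd, __u64 *cookie)
{
	__u32 key = 0;
	__s64 fd64 = sk_fd;	/* value_size=8: the fd is passed as 64 bits */
	int err;

	err = bpf_map_update_elem(map_fd, &key, &fd64, BPF_ANY);
	if (err)
		return err;

	/* With value_size=4 this lookup would fail with ENOSPC;
	 * with value_size=8 it returns sock_gen_cookie(sk).
	 */
	return bpf_map_lookup_elem(map_fd, &key, cookie);
}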

Signed-off-by: Martin KaFai Lau 
Acked-by: Alexei Starovoitov 
---
 include/linux/bpf.h  |  28 +++
 include/linux/bpf_types.h|   3 +
 include/uapi/linux/bpf.h |   1 +
 kernel/bpf/Makefile  |   3 +
 kernel/bpf/arraymap.c|   2 +-
 kernel/bpf/reuseport_array.c | 363 +++
 kernel/bpf/syscall.c |   6 +
 net/core/sock_reuseport.c|   8 +
 8 files changed, 413 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/reuseport_array.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd8790d2c6ed..db11662faea6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 }
 
 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
+int array_map_alloc_check(union bpf_attr *attr);
 
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
@@ -769,6 +770,33 @@ static inline void __xs

[PATCH bpf-next 6/9] bpf: Refactor ARRAY_SIZE macro to bpf_util.h

2018-08-08 Thread Martin KaFai Lau
This patch refactors the ARRAY_SIZE macro to bpf_util.h.

Signed-off-by: Martin KaFai Lau 
Acked-by: Alexei Starovoitov 
---
 tools/testing/selftests/bpf/bpf_util.h   | 4 
 tools/testing/selftests/bpf/test_align.c | 5 +
 tools/testing/selftests/bpf/test_btf.c   | 5 +
 tools/testing/selftests/bpf/test_sock.c  | 5 +
 tools/testing/selftests/bpf/test_sock_addr.c | 5 +
 tools/testing/selftests/bpf/test_verifier.c  | 5 +
 6 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h
index d0811b3d6a6f..315a44fa32af 100644
--- a/tools/testing/selftests/bpf/bpf_util.h
+++ b/tools/testing/selftests/bpf/bpf_util.h
@@ -44,4 +44,8 @@ static inline unsigned int bpf_num_possible_cpus(void)
name[bpf_num_possible_cpus()]
 #define bpf_percpu(name, cpu) name[(cpu)].v
 
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
 #endif /* __BPF_UTIL__ */
diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c
index 6b1b302310fe..5f377ec53f2f 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -18,10 +18,7 @@
 
 #include "../../../include/linux/filter.h"
 #include "bpf_rlimit.h"
-
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
+#include "bpf_util.h"
 
 #define MAX_INSNS  512
 #define MAX_MATCHES16
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index 3619f3023088..a5688b66a926 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -19,6 +19,7 @@
 #include 
 
 #include "bpf_rlimit.h"
+#include "bpf_util.h"
 
 static uint32_t pass_cnt;
 static uint32_t error_cnt;
@@ -93,10 +94,6 @@ static int __base_pr(const char *format, ...)
 #define MAX_NR_RAW_TYPES 1024
 #define BTF_LOG_BUF_SIZE 65535
 
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
 static struct args {
unsigned int raw_test_num;
unsigned int file_test_num;
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
index f4d99fabc56d..b8ebe2f58074 100644
--- a/tools/testing/selftests/bpf/test_sock.c
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -14,10 +14,7 @@
 
 #include "cgroup_helpers.h"
 #include "bpf_rlimit.h"
-
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
+#include "bpf_util.h"
 
 #define CG_PATH"/foo"
 #define MAX_INSNS  512
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index 2e45c92d..aeeb76a54d63 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -20,15 +20,12 @@
 
 #include "cgroup_helpers.h"
 #include "bpf_rlimit.h"
+#include "bpf_util.h"
 
 #ifndef ENOTSUPP
 # define ENOTSUPP 524
 #endif
 
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
 #define CG_PATH"/foo"
 #define CONNECT4_PROG_PATH "./connect4_prog.o"
 #define CONNECT6_PROG_PATH "./connect6_prog.o"
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index ac281ee771dd..5ab696d2fb10 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -42,12 +42,9 @@
 #endif
 #include "bpf_rlimit.h"
 #include "bpf_rand.h"
+#include "bpf_util.h"
 #include "../../../include/linux/filter.h"
 
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
 #define MAX_INSNS  BPF_MAXINSNS
 #define MAX_FIXUPS 8
 #define MAX_NR_MAPS8
-- 
2.17.1



[PATCH bpf-next 5/9] bpf: Enable BPF_PROG_TYPE_SK_REUSEPORT bpf prog in reuseport selection

2018-08-08 Thread Martin KaFai Lau
This patch allows a BPF_PROG_TYPE_SK_REUSEPORT bpf prog to select a
SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY introduced in
the earlier patch.  "bpf_run_sk_reuseport()" will return -ECONNREFUSED
when the BPF_PROG_TYPE_SK_REUSEPORT prog returns SK_DROP.
The callers, in inet[6]_hashtable.c and ipv[46]/udp.c, are modified to
handle this case and return NULL immediately instead of continuing the
sk search from its hashtable.

It re-uses the existing SO_ATTACH_REUSEPORT_EBPF setsockopt to attach
BPF_PROG_TYPE_SK_REUSEPORT.  The "sk_reuseport_attach_bpf()" will check
if the attaching bpf prog is of the new SK_REUSEPORT or the existing
SOCKET_FILTER type and then checks different things accordingly.

One level of "__reuseport_attach_prog()" call is removed.  The
"sk_unhashed() && ..." and "sk->sk_reuseport_cb" tests are pushed
back to "reuseport_attach_prog()" in sock_reuseport.c.  sock_reuseport.c
seems to have more knowledge on those test requirements than filter.c.
In "reuseport_attach_prog()", after new_prog is attached to reuse->prog,
the old_prog (if any) is also directly freed instead of returning the
old_prog to the caller and asking the caller to free.

The sysctl_optmem_max check is moved back to the
"sk_reuseport_attach_filter()" and "sk_reuseport_attach_bpf()".
As with other bpf prog types, the new BPF_PROG_TYPE_SK_REUSEPORT is only
bounded by the usual "bpf_prog_charge_memlock()" during load time
instead of by both bpf_prog_charge_memlock and sysctl_optmem_max.
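
For illustration, the userspace attach step stays the same as the
existing one (prog_fd is assumed to be a loaded
BPF_PROG_TYPE_SK_REUSEPORT prog):

#include <sys/socket.h>

static int attach_reuseport_prog(int sk_fd, int prog_fd)
{
	/* Same sockopt as the SOCKET_FILTER attach path; the kernel
	 * checks the prog type and applies the SK_REUSEPORT specific
	 * checks accordingly.
	 */
	return setsockopt(sk_fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
			  &prog_fd, sizeof(prog_fd));
}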

Signed-off-by: Martin KaFai Lau 
Acked-by: Alexei Starovoitov 
---
 include/linux/filter.h   |  1 +
 include/net/sock_reuseport.h |  3 +-
 net/core/filter.c| 87 +---
 net/core/sock_reuseport.c| 36 ++-
 net/ipv4/inet_hashtables.c   | 14 +++---
 net/ipv4/udp.c   |  4 ++
 net/ipv6/inet6_hashtables.c  | 14 +++---
 net/ipv6/udp.c   |  4 ++
 8 files changed, 106 insertions(+), 57 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 29577c6f3289..e44c531f2002 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -739,6 +739,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_attach_bpf(u32 ufd, struct sock *sk);
 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
+void sk_reuseport_prog_free(struct bpf_prog *prog);
 int sk_detach_filter(struct sock *sk);
 int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
  unsigned int len);
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 73b569556be6..8a5f70c7cdf2 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -34,8 +34,7 @@ extern struct sock *reuseport_select_sock(struct sock *sk,
  u32 hash,
  struct sk_buff *skb,
  int hdr_len);
-extern struct bpf_prog *reuseport_attach_prog(struct sock *sk,
- struct bpf_prog *prog);
+extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
 int reuseport_get_id(struct sock_reuseport *reuse);
 
 #endif  /* _SOCK_REUSEPORT_H */
diff --git a/net/core/filter.c b/net/core/filter.c
index f4c928709756..315db0a478f0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1453,30 +1453,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
return 0;
 }
 
-static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
-{
-   struct bpf_prog *old_prog;
-   int err;
-
-   if (bpf_prog_size(prog->len) > sysctl_optmem_max)
-   return -ENOMEM;
-
-   if (sk_unhashed(sk) && sk->sk_reuseport) {
-   err = reuseport_alloc(sk, false);
-   if (err)
-   return err;
-   } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
-   /* The socket wasn't bound with SO_REUSEPORT */
-   return -EINVAL;
-   }
-
-   old_prog = reuseport_attach_prog(sk, prog);
-   if (old_prog)
-   bpf_prog_destroy(old_prog);
-
-   return 0;
-}
-
 static
 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
 {
@@ -1550,13 +1526,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (IS_ERR(prog))
return PTR_ERR(prog);
 
-   err = __reuseport_attach_prog(prog, sk);
-   if (err < 0) {
+   if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+   err = -ENOMEM;
+   else
+   err = reuseport_attach_prog(sk, prog);
+
+   if (err)
__bpf_prog_release(prog);
-   retur

[PATCH bpf-next 7/9] bpf: Sync bpf.h uapi to tools/

2018-08-08 Thread Martin KaFai Lau
This patch syncs include/uapi/linux/bpf.h to
tools/include/uapi/linux/.

Signed-off-by: Martin KaFai Lau 
Acked-by: Alexei Starovoitov 
---
 tools/include/uapi/linux/bpf.h | 37 +-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index dd5758dc35d3..3102a2a23c31 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -126,6 +126,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
+   BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 };
 
 enum bpf_prog_type {
@@ -150,6 +151,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_PROG_TYPE_LWT_SEG6LOCAL,
BPF_PROG_TYPE_LIRC_MODE2,
+   BPF_PROG_TYPE_SK_REUSEPORT,
 };
 
 enum bpf_attach_type {
@@ -2113,6 +2115,14 @@ union bpf_attr {
  * the shared data.
  * Return
  * Pointer to the local storage area.
+ *
+ * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map.
+ * It checks that the selected sk matches the incoming
+ * request in the skb.
+ * Return
+ * 0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -2196,7 +2206,8 @@ union bpf_attr {
FN(rc_keydown), \
FN(skb_cgroup_id),  \
FN(get_current_cgroup_id),  \
-   FN(get_local_storage),
+   FN(get_local_storage),  \
+   FN(sk_select_reuseport),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2413,6 +2424,30 @@ struct sk_msg_md {
__u32 local_port;   /* stored in host byte order */
 };
 
+struct sk_reuseport_md {
+   /*
+* Start of directly accessible data. It begins from
+* the tcp/udp header.
+*/
+   void *data;
+   void *data_end; /* End of directly accessible data */
+   /*
+* Total length of packet (starting from the tcp/udp header).
+* Note that the directly accessible bytes (data_end - data)
+* could be less than this "len".  Those bytes could be
+* indirectly read by a helper "bpf_skb_load_bytes()".
+*/
+   __u32 len;
+   /*
+* Eth protocol in the mac header (network byte order). e.g.
+* ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
+*/
+   __u32 eth_protocol;
+   __u32 ip_protocol;  /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
+   __u32 bind_inany;   /* Is sock bound to an INANY address? */
+   __u32 hash; /* A hash of the packet 4 tuples */
+};
+
 #define BPF_TAG_SIZE   8
 
 struct bpf_prog_info {
-- 
2.17.1



[PATCH bpf-next 1/9] tcp: Avoid TCP syncookie rejected by SO_REUSEPORT socket

2018-08-08 Thread Martin KaFai Lau
Although the actual cookie check "__cookie_v[46]_check()" does
not involve sk specific info, it checks whether the sk has a recent
synq overflow event in "tcp_synq_no_recent_overflow()".  The
tcp_sk(sk)->rx_opt.ts_recent_stamp is updated every second
when it has sent out a syncookie (through "tcp_synq_overflow()").

The above per sk "recent synq overflow event timestamp" works well
for the non SO_REUSEPORT use case.  However, it may cause random
connection requests to be rejected/discarded when SO_REUSEPORT is used
with syncookies because they fail the "tcp_synq_no_recent_overflow()"
test.

When SO_REUSEPORT is used, it usually has multiple listening
socks serving TCP connection requests destined to the same local IP:PORT.
There are cases that the TCP-ACK-COOKIE may not be received
by the same sk that sent out the syncookie.  For example,
if reuse->socks[] began with {sk0, sk1},
1) sk1 sent out syncookies and tcp_sk(sk1)->rx_opt.ts_recent_stamp
   was updated.
2) the reuse->socks[] became {sk1, sk2} later.  e.g. sk0 was first closed
   and then sk2 was added.  Here, sk2 does not have ts_recent_stamp set.
   There are other orderings that will trigger a similar situation,
   but the idea is the same.
3) When the TCP-ACK-COOKIE comes back, sk2 was selected.
   "tcp_synq_no_recent_overflow(sk2)" returns true. In this case,
   all syncookies sent by sk1 will be handled (and rejected)
   by sk2 while sk1 is still alive.

The userspace may create and remove listening SO_REUSEPORT sockets
as it sees fit, e.g. adding a new thread (and SO_REUSEPORT sock) to handle
incoming requests, an old process stopping and a new process starting, etc.
With or without SO_ATTACH_REUSEPORT_[CB]BPF,
sockets leaving and joining a reuseport group make picking
the same sk to check the syncookie very difficult (if not impossible).

The later patches will allow bpf prog more flexibility in deciding
where a sk should be located in a bpf map and selecting a particular
SO_REUSEPORT sock as it sees fit.  e.g. Without closing any sock,
replace the whole bpf reuseport_array in one map_update() by using
map-in-map.  Getting the syncookie check working smoothly across
socks in the same "reuse->socks[]" is important.

A partial solution is to set the newly added sk's ts_recent_stamp
to the max ts_recent_stamp of the reuseport group, but that would require
iterating through reuse->socks[], OR
to pessimistically set it to "now - TCP_SYNCOOKIE_VALID" when a sk
joins a reuseport group.  However, neither of them solves the case of an
existing sk getting moved around the reuse->socks[]: that
sk may not have ts_recent_stamp updated, which is unlikely under
continuous synflood but not impossible.

This patch opts to treat the reuseport group as a whole when
considering the last synq overflow timestamp since
they are serving the same IP:PORT from the userspace
(and BPF program) perspective.

"synq_overflow_ts" is added to "struct sock_reuseport".
The tcp_synq_overflow() and tcp_synq_no_recent_overflow()
will update/check reuse->synq_overflow_ts if the sk is
in a reuseport group.  Similar to the reuseport decision in
__inet_lookup_listener(), both sk->sk_reuseport and
sk->sk_reuseport_cb are tested for SO_REUSEPORT usage.
Update on "synq_overflow_ts" happens at roughly once
every second.

A synflood test was done with 16 rx-queues and 16 reuseport sockets.
No meaningful performance change was observed; throughput is ~9Mpps
in IPv4 both before and after the change.
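
A simplified sketch of the resulting tcp_synq_overflow() logic
(illustrative, not the exact hunk):

static inline void tcp_synq_overflow_sketch(const struct sock *sk)
{
	unsigned int now = jiffies;

	if (sk->sk_reuseport) {
		struct sock_reuseport *reuse;

		reuse = rcu_dereference(sk->sk_reuseport_cb);
		if (likely(reuse)) {
			/* one timestamp for the whole group, written
			 * at most about once per second
			 */
			if (time_after32(now,
					 READ_ONCE(reuse->synq_overflow_ts) + HZ))
				WRITE_ONCE(reuse->synq_overflow_ts, now);
			return;
		}
	}

	/* non SO_REUSEPORT case: per sk timestamp as before */
	if (time_after(now, tcp_sk(sk)->rx_opt.ts_recent_stamp + HZ))
		tcp_sk(sk)->rx_opt.ts_recent_stamp = now;
}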

Cc: Eric Dumazet 
Signed-off-by: Martin KaFai Lau 
Acked-by: Alexei Starovoitov 
---
 include/net/sock_reuseport.h |  4 
 include/net/tcp.h| 30 --
 net/core/sock_reuseport.c|  1 +
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 0054b3a9b923..6bef7a0052f2 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -12,6 +12,10 @@ struct sock_reuseport {
 
u16 max_socks;  /* length of socks */
u16 num_socks;  /* elements in socks */
+   /* The last synq overflow event timestamp of this
+* reuse->socks[] group.
+*/
+   unsigned intsynq_overflow_ts;
struct bpf_prog __rcu   *prog;  /* optional BPF sock selector */
struct sock *socks[0];  /* array of sock pointers */
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f6e0a9b1dff3..b0587318c64d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -472,9 +473,22 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
  */
 static inline void tcp_synq_overflow(const struct sock *sk)
 {
-   unsigned int last_overfl

[PATCH bpf-next 0/9] Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY and BPF_PROG_TYPE_SK_REUSEPORT

2018-08-08 Thread Martin KaFai Lau
This series introduces a new map type "BPF_MAP_TYPE_REUSEPORT_SOCKARRAY"
and a new prog type BPF_PROG_TYPE_SK_REUSEPORT.

Here is a snippet from a commit message:

"To unleash the full potential of a bpf prog, it is essential for the
userspace to be capable of directly setting up a bpf map which can then
be consumed by the bpf prog to make decisions.  In this case, deciding which
SO_REUSEPORT sk should serve the incoming request.

By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control
and visibility on where a SO_REUSEPORT sk should be located in a bpf map.
The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that
the bpf prog can directly select a sk from the bpf map.  That will
raise the programmability of the bpf prog attached to a reuseport
group (a group of sk serving the same IP:PORT).

For example, in UDP, the bpf prog can peek into the payload (e.g.
through the "data" pointer introduced in the later patch) to learn
the application level's connection information and then decide which sk
to pick from a bpf map.  The userspace can tightly couple the sk's location
in a bpf map with the application logic in generating the UDP payload's
connection information.  This connection info contract/API stays within the
userspace.

Also, when used with map-in-map, the userspace can switch the
old-server-process's inner map to a new-server-process's inner map
in one call "bpf_map_update_elem(outer_map, , _reuseport_array)".
The bpf prog will then direct incoming requests to the new process instead
of the old process.  The old process can finish draining the pending
requests (e.g. by "accept()") before closing the old-fds.  [Note that
deleting a fd from a bpf map does not necessarily mean the fd is closed.]"

Please see the individual patches for details.

Martin KaFai Lau (9):
  tcp: Avoid TCP syncookie rejected by SO_REUSEPORT socket
  net: Add ID (if needed) to sock_reuseport and expose reuseport_lock
  bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY
  bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT
  bpf: Enable BPF_PROG_TYPE_SK_REUSEPORT bpf prog in reuseport selection
  bpf: Refactor ARRAY_SIZE macro to bpf_util.h
  bpf: Sync bpf.h uapi to tools/
  bpf: test BPF_MAP_TYPE_REUSEPORT_SOCKARRAY
  bpf: Test BPF_PROG_TYPE_SK_REUSEPORT

 include/linux/bpf.h   |  28 +
 include/linux/bpf_types.h |   6 +
 include/linux/filter.h|  16 +
 include/net/addrconf.h|   1 +
 include/net/sock_reuseport.h  |  19 +-
 include/net/tcp.h |  30 +-
 include/uapi/linux/bpf.h  |  37 +-
 kernel/bpf/Makefile   |   3 +
 kernel/bpf/arraymap.c |   2 +-
 kernel/bpf/reuseport_array.c  | 363 +
 kernel/bpf/syscall.c  |   6 +
 kernel/bpf/verifier.c |   9 +
 net/core/filter.c | 354 -
 net/core/sock_reuseport.c |  92 ++-
 net/ipv4/inet_connection_sock.c   |   9 +
 net/ipv4/inet_hashtables.c|  19 +-
 net/ipv4/udp.c|   9 +-
 net/ipv6/inet6_hashtables.c   |  14 +-
 net/ipv6/udp.c|   4 +
 tools/include/uapi/linux/bpf.h|  37 +-
 tools/lib/bpf/bpf.c   |   1 +
 tools/lib/bpf/bpf.h   |   1 +
 tools/lib/bpf/libbpf.c|   1 +
 tools/testing/selftests/bpf/Makefile  |   4 +-
 tools/testing/selftests/bpf/bpf_helpers.h |   4 +
 tools/testing/selftests/bpf/bpf_util.h|   4 +
 tools/testing/selftests/bpf/test_align.c  |   5 +-
 tools/testing/selftests/bpf/test_btf.c|   5 +-
 tools/testing/selftests/bpf/test_maps.c   | 262 ++-
 .../selftests/bpf/test_select_reuseport.c | 688 ++
 .../bpf/test_select_reuseport_common.h|  36 +
 .../bpf/test_select_reuseport_kern.c  | 180 +
 tools/testing/selftests/bpf/test_sock.c   |   5 +-
 tools/testing/selftests/bpf/test_sock_addr.c  |   5 +-
 tools/testing/selftests/bpf/test_verifier.c   |   5 +-
 35 files changed, 2167 insertions(+), 97 deletions(-)
 create mode 100644 kernel/bpf/reuseport_array.c
 create mode 100644 tools/testing/selftests/bpf/test_select_reuseport.c
 create mode 100644 tools/testing/selftests/bpf/test_select_reuseport_common.h
 create mode 100644 tools/testing/selftests/bpf/test_select_reuseport_kern.c

-- 
2.17.1



[PATCH bpf] bpf: btf: Change tools/lib/bpf/btf to LGPL

2018-08-05 Thread Martin KaFai Lau
This patch changes tools/lib/bpf/btf.[ch] to LGPL, which
is in line with the rest of libbpf.

Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/btf.c | 2 +-
 tools/lib/bpf/btf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 2d270c560df3..c36a3a76986a 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: LGPL-2.1
 /* Copyright (c) 2018 Facebook */
 
 #include 
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index e2a09a155f84..caac3a404dc5 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: LGPL-2.1 */
 /* Copyright (c) 2018 Facebook */
 
 #ifndef __BPF_BTF_H
-- 
2.17.1



[PATCH bpf] bpf: btf: Use exact btf value_size match in map_check_btf()

2018-07-26 Thread Martin KaFai Lau
The current map_check_btf() in BPF_MAP_TYPE_ARRAY rejects
'> map->value_size' to ensure map_seq_show_elem() will not
access things beyond an array element.

Yonghong suggested that using '!=' is a more correct
check.  The 8-byte round_up on value_size is stored
in array->elem_size.  Hence, using '!=' on map->value_size
is a proper check.
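
A worked example (sizes assumed for illustration):

	/* map->value_size = 12  =>  array->elem_size = round_up(12, 8) = 16.
	 * The old '>' check would accept a smaller BTF value type, e.g.
	 * 8 bytes, leaving 4 bytes of each element that
	 * map_seq_show_elem() could never pretty-print.  The exact check
	 * rejects both smaller and bigger BTF value types:
	 */
	if (!value_type || value_size != map->value_size)
		return -EINVAL;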

This patch also adds new tests to check the btf array
key type and value type.  Two of these new tests verify
the btf's value_size (the change in this patch).

It also fixes two existing tests that wrongly encoded
a btf's type size (pprint_test) and the value_type_id (in one
of the raw_tests[]).  However, those mistakes did not affect these two
tests' BTF verification before or after this change.
These two tests would mainly fail at array creation time after
this patch.

Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap")
Suggested-by: Yonghong Song 
Acked-by: Yonghong Song 
Signed-off-by: Martin KaFai Lau 
---
 kernel/bpf/arraymap.c  |  2 +-
 tools/testing/selftests/bpf/test_btf.c | 86 +-
 2 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 544e58f5f642..2aa55d030c77 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -378,7 +378,7 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
return -EINVAL;
 
value_type = btf_type_id_size(btf, _value_id, _size);
-   if (!value_type || value_size > map->value_size)
+   if (!value_type || value_size != map->value_size)
return -EINVAL;
 
return 0;
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index 402c0f7cc418..ffdd27737c9e 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -507,7 +507,7 @@ static struct btf_raw_test raw_tests[] = {
.key_size = sizeof(int),
.value_size = sizeof(void *) * 4,
.key_type_id = 1,
-   .value_type_id = 4,
+   .value_type_id = 5,
.max_entries = 4,
 },
 
@@ -1292,6 +1292,88 @@ static struct btf_raw_test raw_tests[] = {
.err_str = "type != 0",
 },
 
+{
+   .descr = "arraymap invalid btf key (a bit field)",
+   .raw_types = {
+   /* int */   /* [1] */
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+   /* 32 bit int with 32 bit offset */ /* [2] */
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 32, 32, 8),
+   BTF_END_RAW,
+   },
+   .str_sec = "",
+   .str_sec_size = sizeof(""),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "array_map_check_btf",
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .key_type_id = 2,
+   .value_type_id = 1,
+   .max_entries = 4,
+   .map_create_err = true,
+},
+
+{
+   .descr = "arraymap invalid btf key (!= 32 bits)",
+   .raw_types = {
+   /* int */   /* [1] */
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+   /* 16 bit int with 0 bit offset */  /* [2] */
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 16, 2),
+   BTF_END_RAW,
+   },
+   .str_sec = "",
+   .str_sec_size = sizeof(""),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "array_map_check_btf",
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .key_type_id = 2,
+   .value_type_id = 1,
+   .max_entries = 4,
+   .map_create_err = true,
+},
+
+{
+   .descr = "arraymap invalid btf value (too small)",
+   .raw_types = {
+   /* int */   /* [1] */
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+   BTF_END_RAW,
+   },
+   .str_sec = "",
+   .str_sec_size = sizeof(""),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "array_map_check_btf",
+   .key_size = sizeof(int),
+   /* btf_value_size < map->value_size */
+   .value_size = sizeof(__u64),
+   .key_type_id = 1,
+   .value_type_id = 1,
+   .max_entries = 4,
+   .map_create_err = true,
+},
+
+{
+   .descr = "arraymap invalid btf value (too big)",
+   .raw_types = {
+   /* int */   /* [1] */
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+   BTF_END_RAW,
+   },
+   .str_sec = "",
+   .str_sec_size = sizeof(""),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "array_map_check_btf",
+   .key_size = sizeof(int),
+   /* btf_value_size > map->value_size */
+   

Re: [PATCH bpf-next v2] bpf: add End.DT6 action to bpf_lwt_seg6_action helper

2018-07-25 Thread Martin KaFai Lau
On Wed, Jul 25, 2018 at 12:36:45PM +, Mathieu Xhonneux wrote:
> The seg6local LWT provides the End.DT6 action, which allows to
> decapsulate an outer IPv6 header containing a Segment Routing Header
> (SRH), full specification is available here:
> 
> https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-05
> 
> This patch adds this action now to the seg6local BPF
> interface. Since it is not mandatory that the inner IPv6 header also
> contains a SRH, seg6_bpf_srh_state has been extended with a pointer to
> a possible SRH of the outermost IPv6 header. This helps assessing if the
> validation must be triggered or not, and avoids some calls to
> ipv6_find_hdr.
> 
> v2: - changed true/false -> 1/0
hmmm...I thought I was asking to replace 1/0 with true/false.  More
below.

> - preempt_enable no longer called in first conditional block
> 
> Signed-off-by: Mathieu Xhonneux 
> ---
>  include/net/seg6_local.h |  4 ++-
>  net/core/filter.c| 83 
> +---
>  net/ipv6/seg6_local.c| 48 ++--
>  3 files changed, 91 insertions(+), 44 deletions(-)
> 
> diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h
> index 661fd5b4d3e0..08359e2d8b35 100644
> --- a/include/net/seg6_local.h
> +++ b/include/net/seg6_local.h
> @@ -21,10 +21,12 @@
>  
>  extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
>  u32 tbl_id);
> +extern bool seg6_bpf_has_valid_srh(struct sk_buff *skb);
>  
>  struct seg6_bpf_srh_state {
> - bool valid;
> + struct ipv6_sr_hdr *srh;
>   u16 hdrlen;
> + bool valid;
"valid" is a bool, so it is easier to read
if true/false is used in srh_state->valid = true/false;

>  };
>  
>  DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 104d560946da..2cdea7d05063 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4542,14 +4542,13 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff 
> *, skb, u32, offset,
>  {
>   struct seg6_bpf_srh_state *srh_state =
>   this_cpu_ptr(&seg6_bpf_srh_states);
> + struct ipv6_sr_hdr *srh = srh_state->srh;
>   void *srh_tlvs, *srh_end, *ptr;
> - struct ipv6_sr_hdr *srh;
>   int srhoff = 0;
>  
> - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
> + if (srh == NULL)
>   return -EINVAL;
>  
> - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
>   srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
>   srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
>  
> @@ -4562,6 +4561,9 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, 
> skb, u32, offset,
>  
>   if (unlikely(bpf_try_make_writable(skb, offset + len)))
>   return -EFAULT;
> + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
> + return -EINVAL;
> + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
>  
>   memcpy(skb->data + offset, from, len);
>   return 0;
> @@ -4577,52 +4579,79 @@ static const struct bpf_func_proto 
> bpf_lwt_seg6_store_bytes_proto = {
>   .arg4_type  = ARG_CONST_SIZE
>  };
>  
> -BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
> -u32, action, void *, param, u32, param_len)
> +static void bpf_update_srh_state(struct sk_buff *skb)
>  {
>   struct seg6_bpf_srh_state *srh_state =
>   this_cpu_ptr(&seg6_bpf_srh_states);
> - struct ipv6_sr_hdr *srh;
>   int srhoff = 0;
> - int err;
> -
> - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
> - return -EINVAL;
> - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
> -
> - if (!srh_state->valid) {
> - if (unlikely((srh_state->hdrlen & 7) != 0))
> - return -EBADMSG;
> -
> - srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
> - if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
> - return -EBADMSG;
>  
> + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
> + srh_state->srh = NULL;
> + } else {
> + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
> + srh_state->hdrlen = srh_state->srh->hdrlen << 3;
>   srh_state->valid = 1;
e.g. here

>   }
> +}
> +
> +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
> +u32, action, void *, param, u32, param_len)
> +{
> + struct seg6_bpf_srh_state *srh_state =
> + this_cpu_ptr(&seg6_bpf_srh_states);
> + int hdroff = 0;
> + int err;
>  
>   switch (action) {
>   case SEG6_LOCAL_ACTION_END_X:
> + if 

Re: [PATCH V2 bpf] xdp: add NULL pointer check in __xdp_return()

2018-07-25 Thread Martin KaFai Lau
On Thu, Jul 26, 2018 at 12:09:50AM +0900, Taehee Yoo wrote:
> rhashtable_lookup() can return NULL. so that NULL pointer
> check routine should be added.
> 
> Fixes: 02b55e5657c3 ("xdp: add MEM_TYPE_ZERO_COPY")
> Signed-off-by: Taehee Yoo 
Acked-by: Martin KaFai Lau 

> ---
> V2 : add WARN_ON_ONCE when xa is NULL.
> 
>  net/core/xdp.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 9d1f220..786fdbe 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -345,7 +345,10 @@ static void __xdp_return(void *data, struct xdp_mem_info 
> *mem, bool napi_direct,
>   rcu_read_lock();
>   /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
>   xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
> - xa->zc_alloc->free(xa->zc_alloc, handle);
> + if (!xa)
> + WARN_ON_ONCE(1);
> + else
> + xa->zc_alloc->free(xa->zc_alloc, handle);
>   rcu_read_unlock();
>   default:
>   /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
> -- 
> 2.9.3
> 


Re: [PATCH] samples/bpf: Add BTF build flags to Makefile

2018-07-25 Thread Martin KaFai Lau
On Wed, Jul 25, 2018 at 01:38:44PM -0700, Martin KaFai Lau wrote:
> On Thu, Jul 26, 2018 at 01:30:39AM +0900, Taeung Song wrote:
> > To smoothly test BTF supported binary on samples/bpf,
> > let samples/bpf/Makefile probe llc, pahole and
> > llvm-objcopy for BPF support and use them
> > like tools/testing/selftests/bpf/Makefile
> > changed from the commit c0fa1b6c3efc ("bpf: btf:
> >  Add BTF tests")
> > 
> > Cc: Martin KaFai Lau 
> > Signed-off-by: Taeung Song 
> Thanks for the patch. LGTM.
> 
> Acked-by: Martin KaFai Lau 
and it should go to bpf-next (Please use the proper tag in the
Subject, thanks!).

> 
> > ---
> >  samples/bpf/Makefile | 21 -
> >  1 file changed, 20 insertions(+), 1 deletion(-)
> > 
> > diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> > index 1303af10e54d..e079266360a3 100644
> > --- a/samples/bpf/Makefile
> > +++ b/samples/bpf/Makefile
> > @@ -191,6 +191,8 @@ HOSTLOADLIBES_xdpsock   += -pthread
> >  #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc 
> > CLANG=~/git/llvm/build/bin/clang
> >  LLC ?= llc
> >  CLANG ?= clang
> > +LLVM_OBJCOPY ?= llvm-objcopy
> > +BTF_PAHOLE ?= pahole
> >  
> >  # Detect that we're cross compiling and use the cross compiler
> >  ifdef CROSS_COMPILE
> > @@ -198,6 +200,20 @@ HOSTCC = $(CROSS_COMPILE)gcc
> >  CLANG_ARCH_ARGS = -target $(ARCH)
> >  endif
> >  
> > +BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep 
> > dwarfris)
> > +BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
> > +BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 
> > 'usage.*llvm')
> > +
> > +ifneq ($(BTF_LLC_PROBE),)
> > +ifneq ($(BTF_PAHOLE_PROBE),)
> > +ifneq ($(BTF_OBJCOPY_PROBE),)
> > +   EXTRA_CFLAGS += -g
> > +   LLC_FLAGS += -mattr=dwarfris
> > +   DWARF2BTF = y
> > +endif
> > +endif
> > +endif
> > +
> >  # Trick to allow make to be run from this directory
> >  all:
> > $(MAKE) -C ../../ $(CURDIR)/ BPF_SAMPLES_PATH=$(CURDIR)
> > @@ -256,4 +272,7 @@ $(obj)/%.o: $(src)/%.c
> > -Wno-gnu-variable-sized-type-not-at-end \
> > -Wno-address-of-packed-member -Wno-tautological-compare \
> > -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
> > -   -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
> > +   -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) 
> > -filetype=obj -o $@
> > +ifeq ($(DWARF2BTF),y)
> > +   $(BTF_PAHOLE) -J $@
> > +endif
> > -- 
> > 2.17.1
> > 


Re: [PATCH] samples/bpf: Add BTF build flags to Makefile

2018-07-25 Thread Martin KaFai Lau
On Thu, Jul 26, 2018 at 01:30:39AM +0900, Taeung Song wrote:
> To smoothly test BTF supported binary on samples/bpf,
> let samples/bpf/Makefile probe llc, pahole and
> llvm-objcopy for BPF support and use them
> like tools/testing/selftests/bpf/Makefile
> changed from the commit c0fa1b6c3efc ("bpf: btf:
>  Add BTF tests")
> 
> Cc: Martin KaFai Lau 
> Signed-off-by: Taeung Song 
Thanks for the patch. LGTM.

Acked-by: Martin KaFai Lau 

> ---
>  samples/bpf/Makefile | 21 -
>  1 file changed, 20 insertions(+), 1 deletion(-)
> 
> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> index 1303af10e54d..e079266360a3 100644
> --- a/samples/bpf/Makefile
> +++ b/samples/bpf/Makefile
> @@ -191,6 +191,8 @@ HOSTLOADLIBES_xdpsock += -pthread
>  #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc 
> CLANG=~/git/llvm/build/bin/clang
>  LLC ?= llc
>  CLANG ?= clang
> +LLVM_OBJCOPY ?= llvm-objcopy
> +BTF_PAHOLE ?= pahole
>  
>  # Detect that we're cross compiling and use the cross compiler
>  ifdef CROSS_COMPILE
> @@ -198,6 +200,20 @@ HOSTCC = $(CROSS_COMPILE)gcc
>  CLANG_ARCH_ARGS = -target $(ARCH)
>  endif
>  
> +BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
> +BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
> +BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 
> 'usage.*llvm')
> +
> +ifneq ($(BTF_LLC_PROBE),)
> +ifneq ($(BTF_PAHOLE_PROBE),)
> +ifneq ($(BTF_OBJCOPY_PROBE),)
> + EXTRA_CFLAGS += -g
> + LLC_FLAGS += -mattr=dwarfris
> + DWARF2BTF = y
> +endif
> +endif
> +endif
> +
>  # Trick to allow make to be run from this directory
>  all:
>   $(MAKE) -C ../../ $(CURDIR)/ BPF_SAMPLES_PATH=$(CURDIR)
> @@ -256,4 +272,7 @@ $(obj)/%.o: $(src)/%.c
>   -Wno-gnu-variable-sized-type-not-at-end \
>   -Wno-address-of-packed-member -Wno-tautological-compare \
>   -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
> - -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
> + -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) 
> -filetype=obj -o $@
> +ifeq ($(DWARF2BTF),y)
> + $(BTF_PAHOLE) -J $@
> +endif
> -- 
> 2.17.1
> 


Re: pahole + BTF was: Re: [Question] bpf: about a new 'tools/bpf/bpf_dwarf2btf'

2018-07-25 Thread Martin KaFai Lau
On Thu, Jul 26, 2018 at 04:21:31AM +0900, Taeung Song wrote:
> 
> 
> On 07/26/2018 03:27 AM, Taeung Song wrote:
> > Hi Arnaldo,
> > 
> > On 07/26/2018 02:52 AM, Arnaldo Carvalho de Melo wrote:
> > > Em Thu, Jul 26, 2018 at 02:23:32AM +0900, Taeung Song escreveu:
> > > > Hi,
> > > > 
> > > > Building bpf programs with .BTF section,
> > > > I thought it'd be better to convert dwarf info to .BTF by
> > > > a new tool such as 'tools/bpf/bpf_dwarf2btf' instead of pahole
> > > > in the future.
> > > > Currently for bpf binary that have .BTF section,
> > > > we need to use pahole from https://github.com/iamkafai/pahole/tree/btf
> > > > with the command line such as "pahole -J bpf_prog.o".
> > > > I think it is great but if implementing new 'bpf_dwarf2btf'
> > > > (dwarf parsing + btf encoder code written by Martin KaFai Lau on
> > > > the pahole project i.e. btf.h, btf_encoder.c, btf_encoder.h,
> > > > libbtf.c, libbtf.h),
> > > > BPF developers would more easily use functionalities based on BTF.
> > > 
> > > What would be easier exactly? Not having to install a package but build
> > > it from the kernel sources?
> > > 
> > > Many kernel developers already have pahole installed for other uses, so
> > > no need to install anything.
> > > 
> > 
> > Understood, but I think there are many non-kernel developers
> > developing BPF programs and they mightn't have or use pahole.
> > 
> > So, if providing the 'dwarf2btf' feature on tools/bpf or tools/bpf/bpftool,
> > non-kernel developers can also more easily build bpf progs with .BTF, no ?
Some quick thoughts,
IMO, if it is in the distro's pahole package, it should be easy
enough for kernel and non-kernel developers to install.
BTF usage is still evolving; we might re-evaluate going forward, but at this
point I think leveraging pahole's existing capability is a good option.

> > 
> 
> Or, if tools/lib/bpf/ had the 'dwarf2btf' feature,
> I think BPF developers could just use bpf programs that have dwarf info
> after compiling with clang '-g' and llc '-mattr=dwarfris', even without
> using pahole.
> Isn't that a good way?
> 
> > > BTW, Daniel, I just pushed to pahole's main repository at:
> > > 
> > >    git://git.kernel.org/pub/scm/devel/pahole/pahole.git
> > > 
> > > with the Martin's BTF patch, so no need to pull from the github one,
> > > I'll tag v1.12 and announce the release so that distro package
> > > maintainers can update their packages.
Awesome! Thanks, Arnaldo!


- Martin


Re: selftests: bpf: test_progs: deadlock at trace_call_bpf

2018-07-25 Thread Martin KaFai Lau
On Tue, Jul 24, 2018 at 02:51:42PM +0530, Naresh Kamboju wrote:
> Deadlock warning on x86 machine while testing selftests: bpf:
> test_progs and running linux next 4.18.0-rc3-next-20180705 and still
> happening on 4.18.0-rc5-next-20180720.
> 
> Has anyone noticed this kernel warning about deadlock?
It should be a false positive.  The head->lock is a percpu
lock and is acquired by the bpf prog running on that cpu when
updating a bpf htab.  Hence, CPU0 and CPU1 are acquiring
a different head->lock.

When looking at a CPU alone, another bpf prog cannot start
running on the same CPU before the currently running bpf prog
has finished.  e.g. There is a percpu "bpf_prog_active" counter
to ensure that in the tracing side.

The head->lock is primarily used in bpf htab updates, which
are used very heavily in most of the bpf progs.  Hence,
replacing the lock with the irqsave version is unnecessary
while having a performance impact.
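
A rough sketch of that per-cpu guard on the tracing side (simplified
from trace_call_bpf(), names illustrative):

static unsigned int trace_call_bpf_sketch(struct bpf_prog *prog, void *ctx)
{
	unsigned int ret;

	preempt_disable();
	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
		/* Another bpf prog is already running on this cpu;
		 * do not run a second one, so the percpu head->lock
		 * can never recurse on one cpu.
		 */
		ret = 0;
		goto out;
	}
	ret = BPF_PROG_RUN(prog, ctx);
 out:
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();

	return ret;
}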

Thanks,
Martin

> 
> selftests: bpf: test_progs
> libbpf: incorrect bpf_call opcode
> libbpf: incorrect bpf_call opcode
> test_pkt_access:FAIL:ipv4 err 0 errno 2 retval 0 duration 126
> test_pkt_access:FAIL:ipv6 err 0 errno 2 retval 0 duration 115
> test_xdp:FAIL:ipv4 err 0 errno 2 retval 3 size 74
> test_xdp:FAIL:ipv6 err 0 errno 2 retval 3 size 114
> test_xdp_adjust_tail:FAIL:ipv4 err 0 errno 2 retval 1 size 54
> test_xdp_adjust_tail:FAIL:ipv6 err 0 errno 2 retval 3 siz[   69.901655]
> [   69.903862] 
> [   69.910213] WARNING: possible irq lock inversion dependency detected
> [   69.916559] 4.18.0-rc3-next-20180705 #1 Not tainted
> [   69.921428] 
> [   69.927774] dd/2928 just changed the state of lock:
> [   69.932643] 22eeb38d (&head->lock){+...}, at:
> pcpu_freelist_push+0x28/0x50
> [   69.940208] but this lock was taken by another, HARDIRQ-safe lock
> in the past:
> [   69.947420]  (&rq->lock){-.-.}
> [   69.947421]
> [   69.947421]
> [   69.947421] and interrupts could create inverse lock ordering between them.
> [   69.947421]
> [   69.961842]
> [   69.961842] other info that might help us debug this:
> [   69.968357]  Possible interrupt unsafe locking scenario:
> [   69.968357]
> [   69.975136]CPU0CPU1
> [   69.979659]
> [   69.984184]   lock(&head->lock);
> [   69.987406]local_irq_disable();
> [   69.993319]lock(&rq->lock);
> [   69.998882]lock(&head->lock);
> [   70.004618]   <Interrupt>
> [   70.007235] lock(&rq->lock);
> [   70.010461]
> [   70.010461]  *** DEADLOCK ***
> [   70.010461]
> [   70.016372] 1 lock held by dd/2928:
> [   70.019856]  #0: ab9293c8 (rcu_read_lock){}, at:
> trace_call_bpf+0x37/0x1d0
> [   70.027768]
> [   70.027768] the shortest dependencies between 2nd lock and 1st lock:
> [   70.035586]  -> (&rq->lock){-.-.} ops: 1401365 {
> [   70.040204] IN-HARDIRQ-W at:
> [   70.043428]   lock_acquire+0xd5/0x1c0
> [   70.048820]   _raw_spin_lock+0x2f/0x40
> [   70.054299]   scheduler_tick+0x51/0xf0
> [   70.059781]   update_process_times+0x47/0x60
> [   70.065779]   tick_periodic+0x2b/0xc0
> [   70.071171]   tick_handle_periodic+0x25/0x70
> [   70.077168]   timer_interrupt+0x15/0x20
> [   70.082731]   __handle_irq_event_percpu+0x48/0x320
> [   70.089250]   handle_irq_event_percpu+0x32/0x80
> [   70.095505]   handle_irq_event+0x39/0x60
> [   70.101157]   handle_level_irq+0x7f/0x100
> [   70.106893]   handle_irq+0x6f/0x110
> [   70.112112]   do_IRQ+0x5c/0x110
> [   70.116982]   ret_from_intr+0x0/0x1d
> [   70.122286]   _raw_spin_unlock_irqrestore+0x38/0x50
> [   70.128891]   __setup_irq+0x45d/0x700
> [   70.134281]   setup_irq+0x4c/0x90
> [   70.139324]   hpet_time_init+0x25/0x37
> [   70.144803]   x86_late_time_init+0xf/0x1c
> [   70.150538]   start_kernel+0x40c/0x4ca
> [   70.156017]   x86_64_start_reservations+0x24/0x26
> [   70.162445]   x86_64_start_kernel+0x6f/0x72
> [   70.168357]   secondary_startup_64+0xa4/0xb0
> [   70.174356] IN-SOFTIRQ-W at:
> [   70.177578]   lock_acquire+0xd5/0x1c0
> [   70.182970]   _raw_spin_lock+0x2f/0x40
> [   70.188448]   try_to_wake_up+0x31b/0x540
> [   70.194097]   wake_up_process+0x15/0x20
> [   70.199661]   swake_up_locked+0x24/0x40
> [   70.205226]   swake_up_one+0x1f/0x30
> [   70.210530]  

Re: [PATCH bpf-next] bpf: add End.DT6 action to bpf_lwt_seg6_action helper

2018-07-24 Thread Martin KaFai Lau
On Tue, Jul 24, 2018 at 04:59:54PM +, Mathieu Xhonneux wrote:
> The seg6local LWT provides the End.DT6 action, which allows to
> decapsulate an outer IPv6 header containing a Segment Routing Header
> (SRH), full specification is available here:
> 
> https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-05
> 
> This patch adds this action now to the seg6local BPF
> interface. Since it is not mandatory that the inner IPv6 header also
> contains a SRH, seg6_bpf_srh_state has been extended with a pointer to
> a possible SRH of the outermost IPv6 header. This helps assessing if the
> validation must be triggered or not, and avoids some calls to
> ipv6_find_hdr.
> 
> Signed-off-by: Mathieu Xhonneux 
> ---
>  include/net/seg6_local.h |  4 ++-
>  net/core/filter.c| 83 
> +---
>  net/ipv6/seg6_local.c| 42 +++-
>  3 files changed, 87 insertions(+), 42 deletions(-)
> 
> diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h
> index 661fd5b4d3e0..08359e2d8b35 100644
> --- a/include/net/seg6_local.h
> +++ b/include/net/seg6_local.h
> @@ -21,10 +21,12 @@
>  
>  extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
>  u32 tbl_id);
> +extern bool seg6_bpf_has_valid_srh(struct sk_buff *skb);
>  
>  struct seg6_bpf_srh_state {
> - bool valid;
> + struct ipv6_sr_hdr *srh;
>   u16 hdrlen;
> + bool valid;
>  };
>  
>  DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 104d560946da..2cdea7d05063 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4542,14 +4542,13 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff 
> *, skb, u32, offset,
>  {
>   struct seg6_bpf_srh_state *srh_state =
>   this_cpu_ptr(&seg6_bpf_srh_states);
> + struct ipv6_sr_hdr *srh = srh_state->srh;
>   void *srh_tlvs, *srh_end, *ptr;
> - struct ipv6_sr_hdr *srh;
>   int srhoff = 0;
>  
> - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
> + if (srh == NULL)
>   return -EINVAL;
>  
> - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
>   srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
>   srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
>  
> @@ -4562,6 +4561,9 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, 
> skb, u32, offset,
>  
>   if (unlikely(bpf_try_make_writable(skb, offset + len)))
>   return -EFAULT;
> + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
> + return -EINVAL;
> + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
>  
>   memcpy(skb->data + offset, from, len);
>   return 0;
> @@ -4577,52 +4579,79 @@ static const struct bpf_func_proto 
> bpf_lwt_seg6_store_bytes_proto = {
>   .arg4_type  = ARG_CONST_SIZE
>  };
>  
> -BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
> -u32, action, void *, param, u32, param_len)
> +static void bpf_update_srh_state(struct sk_buff *skb)
>  {
>   struct seg6_bpf_srh_state *srh_state =
>   this_cpu_ptr(&seg6_bpf_srh_states);
> - struct ipv6_sr_hdr *srh;
>   int srhoff = 0;
> - int err;
> -
> - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
> - return -EINVAL;
> - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
> -
> - if (!srh_state->valid) {
> - if (unlikely((srh_state->hdrlen & 7) != 0))
> - return -EBADMSG;
> -
> - srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
> - if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
> - return -EBADMSG;
>  
> + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
> + srh_state->srh = NULL;
> + } else {
> + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
> + srh_state->hdrlen = srh_state->srh->hdrlen << 3;
>   srh_state->valid = 1;
>   }
> +}
> +
> +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
> +u32, action, void *, param, u32, param_len)
> +{
> + struct seg6_bpf_srh_state *srh_state =
> + this_cpu_ptr(&seg6_bpf_srh_states);
> + int hdroff = 0;
> + int err;
>  
>   switch (action) {
>   case SEG6_LOCAL_ACTION_END_X:
> + if (!seg6_bpf_has_valid_srh(skb))
> + return -EBADMSG;
>   if (param_len != sizeof(struct in6_addr))
>   return -EINVAL;
>   return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
>   case SEG6_LOCAL_ACTION_END_T:
> + if 

[PATCH v3 bpf 1/3] bpf: btf: Sync uapi btf.h to tools

2018-07-24 Thread Martin KaFai Lau
This patch syncs the uapi btf.h to tools/.

Fixes: 36fc3c8c282c ("bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h")
Signed-off-by: Martin KaFai Lau 
Acked-by: Yonghong Song 
---
 tools/include/uapi/linux/btf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index 0b5ddbe135a4..972265f32871 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -76,7 +76,7 @@ struct btf_type {
  */
 #define BTF_INT_ENCODING(VAL)  (((VAL) & 0x0f000000) >> 24)
 #define BTF_INT_OFFSET(VAL)    (((VAL  & 0x00ff0000)) >> 16)
-#define BTF_INT_BITS(VAL)  ((VAL)  & 0x0000ffff)
+#define BTF_INT_BITS(VAL)  ((VAL)  & 0x000000ff)
 
 /* Attributes stored in the BTF_INT_ENCODING */
 #define BTF_INT_SIGNED (1 << 0)
-- 
2.17.1



[PATCH v3 bpf 2/3] bpf: Replace [u]int32_t and [u]int64_t in libbpf

2018-07-24 Thread Martin KaFai Lau
This patch replaces [u]int32_t and [u]int64_t usage with
__[su]32 and __[su]64.  The same change goes for [u]int16_t
and [u]int8_t.

Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf")
Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/btf.c| 34 --
 tools/lib/bpf/btf.h|  8 
 tools/lib/bpf/libbpf.c | 12 ++--
 tools/lib/bpf/libbpf.h |  4 ++--
 4 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 8c54a4b6f187..b80de80b4584 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2018 Facebook */
 
 #include <stdlib.h>
-#include <stdint.h>
 #include <string.h>
 #include <unistd.h>
 #include <errno.h>
@@ -27,13 +26,13 @@ struct btf {
struct btf_type **types;
const char *strings;
void *nohdr_data;
-   uint32_t nr_types;
-   uint32_t types_size;
-   uint32_t data_size;
+   __u32 nr_types;
+   __u32 types_size;
+   __u32 data_size;
int fd;
 };
 
-static const char *btf_name_by_offset(const struct btf *btf, uint32_t offset)
+static const char *btf_name_by_offset(const struct btf *btf, __u32 offset)
 {
if (offset < btf->hdr->str_len)
		return &btf->strings[offset];
@@ -45,7 +44,7 @@ static int btf_add_type(struct btf *btf, struct btf_type *t)
 {
if (btf->types_size - btf->nr_types < 2) {
struct btf_type **new_types;
-   u32 expand_by, new_size;
+   __u32 expand_by, new_size;
 
if (btf->types_size == BTF_MAX_NR_TYPES)
return -E2BIG;
@@ -72,7 +71,7 @@ static int btf_add_type(struct btf *btf, struct btf_type *t)
 static int btf_parse_hdr(struct btf *btf, btf_print_fn_t err_log)
 {
const struct btf_header *hdr = btf->hdr;
-   u32 meta_left;
+   __u32 meta_left;
 
if (btf->data_size < sizeof(struct btf_header)) {
elog("BTF header not found\n");
@@ -151,7 +150,7 @@ static int btf_parse_type_sec(struct btf *btf, 
btf_print_fn_t err_log)
 
while (next_type < end_type) {
struct btf_type *t = next_type;
-   uint16_t vlen = BTF_INFO_VLEN(t->info);
+   __u16 vlen = BTF_INFO_VLEN(t->info);
int err;
 
next_type += sizeof(*t);
@@ -191,7 +190,7 @@ static int btf_parse_type_sec(struct btf *btf, 
btf_print_fn_t err_log)
 }
 
 static const struct btf_type *btf_type_by_id(const struct btf *btf,
-uint32_t type_id)
+__u32 type_id)
 {
if (type_id > btf->nr_types)
return NULL;
@@ -209,7 +208,7 @@ static bool btf_type_is_void_or_null(const struct btf_type 
*t)
return !t || btf_type_is_void(t);
 }
 
-static int64_t btf_type_size(const struct btf_type *t)
+static __s64 btf_type_size(const struct btf_type *t)
 {
switch (BTF_INFO_KIND(t->info)) {
case BTF_KIND_INT:
@@ -226,12 +225,12 @@ static int64_t btf_type_size(const struct btf_type *t)
 
 #define MAX_RESOLVE_DEPTH 32
 
-int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id)
+__s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 {
const struct btf_array *array;
const struct btf_type *t;
-   uint32_t nelems = 1;
-   int64_t size = -1;
+   __u32 nelems = 1;
+   __s64 size = -1;
int i;
 
t = btf_type_by_id(btf, type_id);
@@ -271,9 +270,9 @@ int64_t btf__resolve_size(const struct btf *btf, uint32_t 
type_id)
return nelems * size;
 }
 
-int32_t btf__find_by_name(const struct btf *btf, const char *type_name)
+__s32 btf__find_by_name(const struct btf *btf, const char *type_name)
 {
-   uint32_t i;
+   __u32 i;
 
if (!strcmp(type_name, "void"))
return 0;
@@ -302,10 +301,9 @@ void btf__free(struct btf *btf)
free(btf);
 }
 
-struct btf *btf__new(uint8_t *data, uint32_t size,
-btf_print_fn_t err_log)
+struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log)
 {
-   uint32_t log_buf_size = 0;
+   __u32 log_buf_size = 0;
char *log_buf = NULL;
struct btf *btf;
int err;
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 74bb344035bb..ed3a84370ccc 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -4,7 +4,7 @@
 #ifndef __BPF_BTF_H
 #define __BPF_BTF_H
 
-#include <stdint.h>
+#include <linux/types.h>
 
 #define BTF_ELF_SEC ".BTF"
 
@@ -14,9 +14,9 @@ typedef int (*btf_print_fn_t)(const char *, ...)
__attribute__((format(printf, 1, 2)));
 
 void btf__free(struct btf *btf);
-struct btf *btf__new(uint8_t *data, uint32_t size, btf_print_fn_t err_log);
-int32_t btf__find_by_name(const struct btf *btf, const char *type_name);
-int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id);
+struct btf *btf__new(__u8

[PATCH v3 bpf 0/3] Introduce BPF_ANNOTATE_KV_PAIR

2018-07-24 Thread Martin KaFai Lau
The series allows the BPF loader to figure out
the btf_key_id and btf_value_id from a map's name
by using BPF_ANNOTATE_KV_PAIR.  It also removes
the old 'typedef' way which requires two separate
typedefs (one for the key and one for the value).

By doing this, iproute2 and libbpf have one
consistent way to figure out the btf_key_type_id and
btf_value_type_id for a map.

The first two patches are some prep/cleanup work.
The last patch introduces BPF_ANNOTATE_KV_PAIR.

v3:
- Replace some more *int*_t and u* usages with the
  equivalent __[su]* in btf.c
v2:
- Fix the incorrect '&&' check on container_type
  in bpf_map_find_btf_info().
- Expose the existing static btf_type_by_id() instead of
  creating a new one.

Martin KaFai Lau (3):
  bpf: btf: Sync uapi btf.h to tools
  bpf: Replace [u]int32_t and [u]int64_t in libbpf
  bpf: Introduce BPF_ANNOTATE_KV_PAIR

 tools/include/uapi/linux/btf.h   |  2 +-
 tools/lib/bpf/btf.c  | 39 +
 tools/lib/bpf/btf.h  | 10 ++-
 tools/lib/bpf/libbpf.c   | 85 +++-
 tools/lib/bpf/libbpf.h   |  4 +-
 tools/testing/selftests/bpf/bpf_helpers.h|  9 +++
 tools/testing/selftests/bpf/test_btf_haskv.c |  7 +-
 7 files changed, 83 insertions(+), 73 deletions(-)

-- 
2.17.1



[PATCH v3 bpf 3/3] bpf: Introduce BPF_ANNOTATE_KV_PAIR

2018-07-24 Thread Martin KaFai Lau
This patch introduces BPF_ANNOTATE_KV_PAIR to signal the
bpf loader about the btf key_type and value_type of a bpf map.
Please refer to the changes in test_btf_haskv.c for its usage.
Both the iproute2 and libbpf loaders will then have the same
convention for finding the map's btf_key_type_id and
btf_value_type_id from a map's name.
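For flavor, here is a sketch of what such an annotation can look like from
the BPF program side.  The names below are illustrative assumptions; the
authoritative macro is the one this patch adds to bpf_helpers.h, and the
loader-side lookup of the "btf_map_<name>" container struct is visible in
the libbpf diff below.

/* Sketch only: a container struct whose name encodes the map name and
 * whose two members carry the key and value types.  The real macro in
 * bpf_helpers.h may differ in detail.
 */
#define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val)		\
	struct btf_map_##name {					\
		type_key key;					\
		type_val value;					\
	};							\
	struct btf_map_##name					\
	__attribute__((section(".maps." #name), used))		\
		btf_map_##name = {}

/* Hypothetical usage next to a map definition in a BPF C program: */
struct my_value { long packets; long bytes; };

BPF_ANNOTATE_KV_PAIR(my_map, int, struct my_value);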

Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf")
Suggested-by: Daniel Borkmann 
Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/btf.c  |  7 +-
 tools/lib/bpf/btf.h  |  2 +
 tools/lib/bpf/libbpf.c   | 75 +++-
 tools/testing/selftests/bpf/bpf_helpers.h|  9 +++
 tools/testing/selftests/bpf/test_btf_haskv.c |  7 +-
 5 files changed, 56 insertions(+), 44 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index b80de80b4584..2d270c560df3 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -189,8 +189,7 @@ static int btf_parse_type_sec(struct btf *btf, 
btf_print_fn_t err_log)
return 0;
 }
 
-static const struct btf_type *btf_type_by_id(const struct btf *btf,
-__u32 type_id)
+const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id)
 {
if (type_id > btf->nr_types)
return NULL;
@@ -233,7 +232,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 
type_id)
__s64 size = -1;
int i;
 
-   t = btf_type_by_id(btf, type_id);
+   t = btf__type_by_id(btf, type_id);
for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t);
 i++) {
size = btf_type_size(t);
@@ -258,7 +257,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 
type_id)
return -EINVAL;
}
 
-   t = btf_type_by_id(btf, type_id);
+   t = btf__type_by_id(btf, type_id);
}
 
if (size < 0)
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index ed3a84370ccc..e2a09a155f84 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -9,6 +9,7 @@
 #define BTF_ELF_SEC ".BTF"
 
 struct btf;
+struct btf_type;
 
 typedef int (*btf_print_fn_t)(const char *, ...)
__attribute__((format(printf, 1, 2)));
@@ -16,6 +17,7 @@ typedef int (*btf_print_fn_t)(const char *, ...)
 void btf__free(struct btf *btf);
 struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
 __s32 btf__find_by_name(const struct btf *btf, const char *type_name);
+const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id);
 __s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
 int btf__fd(const struct btf *btf);
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 6deb4fe4fffe..d881d370616c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1014,68 +1015,72 @@ bpf_program__collect_reloc(struct bpf_program *prog, 
GElf_Shdr *shdr,
 
 static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
 {
+   const struct btf_type *container_type;
+   const struct btf_member *key, *value;
	struct bpf_map_def *def = &map->def;
const size_t max_name = 256;
+   char container_name[max_name];
__s64 key_size, value_size;
-   __s32 key_id, value_id;
-   char name[max_name];
+   __s32 container_id;
 
-   /* Find key type by name from BTF */
-   if (snprintf(name, max_name, "%s_key", map->name) == max_name) {
-   pr_warning("map:%s length of BTF key_type:%s_key is too long\n",
+   if (snprintf(container_name, max_name, "btf_map_%s", map->name) ==
+   max_name) {
+   pr_warning("map:%s length of 'btf_map_%s' is too long\n",
   map->name, map->name);
return -EINVAL;
}
 
-   key_id = btf__find_by_name(btf, name);
-   if (key_id < 0) {
-   pr_debug("map:%s key_type:%s cannot be found in BTF\n",
-map->name, name);
-   return key_id;
+   container_id = btf__find_by_name(btf, container_name);
+   if (container_id < 0) {
+   pr_debug("map:%s container_name:%s cannot be found in BTF. 
Missing BPF_ANNOTATE_KV_PAIR?\n",
+map->name, container_name);
+   return container_id;
}
 
-   key_size = btf__resolve_size(btf, key_id);
-   if (key_size < 0) {
-   pr_warning("map:%s key_type:%s cannot get the BTF type_size\n",
-  map->name, name);
-   return key_size;
+   container_type = btf__type_by_id(btf, container_id);
+   if (!container_type) {
+   pr_warning("map:%s cannot fi

Re: [PATCH v2 bpf 2/3] bpf: Replace [u]int32_t and [u]int64_t in libbpf

2018-07-23 Thread Martin KaFai Lau
On Mon, Jul 23, 2018 at 11:04:34AM -0700, Yonghong Song wrote:
> 
> 
> On 7/21/18 11:20 AM, Martin KaFai Lau wrote:
> > This patch replaces [u]int32_t and [u]int64_t usage with
> > __[su]32 and __[su]64.  The same change goes for [u]int16_t
> > and [u]int8_t.
> > 
> > Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf")
> > Signed-off-by: Martin KaFai Lau 
> > ---
> >   tools/lib/bpf/btf.c| 28 +---
> >   tools/lib/bpf/btf.h|  8 
> >   tools/lib/bpf/libbpf.c | 12 ++--
> >   tools/lib/bpf/libbpf.h |  4 ++--
> >   4 files changed, 25 insertions(+), 27 deletions(-)
> > 
> > diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
> > index 8c54a4b6f187..ce77b5b57912 100644
> > --- a/tools/lib/bpf/btf.c
> > +++ b/tools/lib/bpf/btf.c
> > @@ -2,7 +2,6 @@
> >   /* Copyright (c) 2018 Facebook */
> >   #include <stdlib.h>
> > -#include <stdint.h>
> >   #include <string.h>
> >   #include <unistd.h>
> >   #include <errno.h>
> > @@ -27,13 +26,13 @@ struct btf {
> > struct btf_type **types;
> > const char *strings;
> > void *nohdr_data;
> > -   uint32_t nr_types;
> > -   uint32_t types_size;
> > -   uint32_t data_size;
> > +   __u32 nr_types;
> > +   __u32 types_size;
> > +   __u32 data_size;
> > int fd;
> >   };
> > -static const char *btf_name_by_offset(const struct btf *btf, uint32_t 
> > offset)
> > +static const char *btf_name_by_offset(const struct btf *btf, __u32 offset)
> >   {
> > if (offset < btf->hdr->str_len)
> > 		return &btf->strings[offset];
> > @@ -151,7 +150,7 @@ static int btf_parse_type_sec(struct btf *btf, 
> > btf_print_fn_t err_log)
> > while (next_type < end_type) {
> > struct btf_type *t = next_type;
> > -   uint16_t vlen = BTF_INFO_VLEN(t->info);
> > +   __u16 vlen = BTF_INFO_VLEN(t->info);
> > int err;
> > next_type += sizeof(*t);
> > @@ -191,7 +190,7 @@ static int btf_parse_type_sec(struct btf *btf, 
> > btf_print_fn_t err_log)
> >   }
> >   static const struct btf_type *btf_type_by_id(const struct btf *btf,
> > -uint32_t type_id)
> > +__u32 type_id)
> >   {
> > if (type_id > btf->nr_types)
> > return NULL;
> > @@ -226,12 +225,12 @@ static int64_t btf_type_size(const struct btf_type *t)
> 
> Missing this one:
>static int64_t btf_type_size(const struct btf_type *t)
> 
> There are a couple of instances of using u32 instead of __u32, better to use
> __u32 everywhere in the same file:
> u32 expand_by, new_size;
> u32 meta_left;
Thanks for pointing them out.  Will make the changes.

> 
> 
> >   #define MAX_RESOLVE_DEPTH 32
> > -int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id)
> > +__s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
> >   {
> > const struct btf_array *array;
> > const struct btf_type *t;
> > -   uint32_t nelems = 1;
> > -   int64_t size = -1;
> > +   __u32 nelems = 1;
> > +   __s64 size = -1;
> > int i;
> > t = btf_type_by_id(btf, type_id);
> > @@ -271,9 +270,9 @@ int64_t btf__resolve_size(const struct btf *btf, 
> > uint32_t type_id)
> > return nelems * size;
> >   }
> > -int32_t btf__find_by_name(const struct btf *btf, const char *type_name)
> > +__s32 btf__find_by_name(const struct btf *btf, const char *type_name)
> >   {
> > -   uint32_t i;
> > +   __u32 i;
> > if (!strcmp(type_name, "void"))
> > return 0;
> > @@ -302,10 +301,9 @@ void btf__free(struct btf *btf)
> > free(btf);
> >   }
> > -struct btf *btf__new(uint8_t *data, uint32_t size,
> > -btf_print_fn_t err_log)
> > +struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log)
> >   {
> > -   uint32_t log_buf_size = 0;
> > +   __u32 log_buf_size = 0;
> > char *log_buf = NULL;
> > struct btf *btf;
> > int err;
> > diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
> > index 74bb344035bb..ed3a84370ccc 100644
> > --- a/tools/lib/bpf/btf.h
> > +++ b/tools/lib/bpf/btf.h
> > @@ -4,7 +4,7 @@
> >   #ifndef __BPF_BTF_H
> >   #define __BPF_BTF_H
> > -#include <stdint.h>
> > +#include <linux/types.h>
> >   #define BTF_ELF_SEC ".BTF"
> > @@ -14,9 +14,9 @@ typedef int (*btf_print_fn_t)(const char *, ...)
> > __attribute

Re: [PATCH v2 bpf 3/3] bpf: Introduce BPF_ANNOTATE_KV_PAIR

2018-07-23 Thread Martin KaFai Lau
On Mon, Jul 23, 2018 at 11:31:43AM -0700, Yonghong Song wrote:
> 
> 
> On 7/21/18 11:20 AM, Martin KaFai Lau wrote:
> > This patch introduces BPF_ANNOTATE_KV_PAIR to signal the
> > bpf loader about the btf key_type and value_type of a bpf map.
> > Please refer to the changes in test_btf_haskv.c for its usage.
> > Both iproute2 and libbpf loader will then have the same
> > convention to find out the map's btf_key_type_id and
> > btf_value_type_id from a map's name.
> > 
> > Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf")
> > Suggested-by: Daniel Borkmann 
> > Signed-off-by: Martin KaFai Lau 
> > ---
> >   tools/lib/bpf/btf.c  |  7 +-
> >   tools/lib/bpf/btf.h  |  2 +
> >   tools/lib/bpf/libbpf.c   | 75 +++-
> >   tools/testing/selftests/bpf/bpf_helpers.h|  9 +++
> >   tools/testing/selftests/bpf/test_btf_haskv.c |  7 +-
> >   5 files changed, 56 insertions(+), 44 deletions(-)
> > 
> > diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
> > index ce77b5b57912..321a99e648ed 100644
> > --- a/tools/lib/bpf/btf.c
> > +++ b/tools/lib/bpf/btf.c
> > @@ -189,8 +189,7 @@ static int btf_parse_type_sec(struct btf *btf, 
> > btf_print_fn_t err_log)
> > return 0;
> >   }
> > -static const struct btf_type *btf_type_by_id(const struct btf *btf,
> > -__u32 type_id)
> > +const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 
> > type_id)
> >   {
> > if (type_id > btf->nr_types)
> > return NULL;
> > @@ -233,7 +232,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 
> > type_id)
> > __s64 size = -1;
> > int i;
> > -   t = btf_type_by_id(btf, type_id);
> > +   t = btf__type_by_id(btf, type_id);
> > for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t);
> >  i++) {
> > size = btf_type_size(t);
> > @@ -258,7 +257,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 
> > type_id)
> > return -EINVAL;
> > }
> > -   t = btf_type_by_id(btf, type_id);
> > +   t = btf__type_by_id(btf, type_id);
> > }
> > if (size < 0)
> > diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
> > index ed3a84370ccc..e2a09a155f84 100644
> > --- a/tools/lib/bpf/btf.h
> > +++ b/tools/lib/bpf/btf.h
> > @@ -9,6 +9,7 @@
> >   #define BTF_ELF_SEC ".BTF"
> >   struct btf;
> > +struct btf_type;
> >   typedef int (*btf_print_fn_t)(const char *, ...)
> > __attribute__((format(printf, 1, 2)));
> > @@ -16,6 +17,7 @@ typedef int (*btf_print_fn_t)(const char *, ...)
> >   void btf__free(struct btf *btf);
> >   struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
> >   __s32 btf__find_by_name(const struct btf *btf, const char *type_name);
> > +const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id);
> >   __s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
> >   int btf__fd(const struct btf *btf);
> > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> > index 6deb4fe4fffe..d881d370616c 100644
> > --- a/tools/lib/bpf/libbpf.c
> > +++ b/tools/lib/bpf/libbpf.c
> > @@ -36,6 +36,7 @@
> >   #include 
> >   #include 
> >   #include 
> > +#include 
> >   #include 
> >   #include 
> >   #include 
> > @@ -1014,68 +1015,72 @@ bpf_program__collect_reloc(struct bpf_program 
> > *prog, GElf_Shdr *shdr,
> >   static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf 
> > *btf)
> >   {
> > +   const struct btf_type *container_type;
> > +   const struct btf_member *key, *value;
> > 	struct bpf_map_def *def = &map->def;
> > const size_t max_name = 256;
> > +   char container_name[max_name];
> > __s64 key_size, value_size;
> > -   __s32 key_id, value_id;
> > -   char name[max_name];
> > +   __s32 container_id;
> > -   /* Find key type by name from BTF */
> > -   if (snprintf(name, max_name, "%s_key", map->name) == max_name) {
> > -   pr_warning("map:%s length of BTF key_type:%s_key is too long\n",
> > +   if (snprintf(container_name, max_name, "btf_map_%s", map->name) ==
> > +   max_name) {
> > +   pr_warning("map:%s length of 'btf_map_%s' is too long\n",
> >map->name, map->name);
&g

Re: [PATCH net] ipv6: use fib6_info_hold_safe() when necessary

2018-07-23 Thread Martin KaFai Lau
On Sat, Jul 21, 2018 at 08:56:32PM -0700, Wei Wang wrote:
> From: Wei Wang 
> 
> In the code path where only rcu read lock is held, e.g. in the route
> lookup code path, it is not safe to directly call fib6_info_hold()
> because the fib6_info may already have been deleted but still exists
> in the rcu grace period. Holding a reference to it could cause a
> double free and crash the kernel.
> 
> This patch adds a new function fib6_info_hold_safe() and replaces
> fib6_info_hold() with it in all necessary places.
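The shape of the fix, sketched (simplified; the actual patch defines the
real helper): take the reference only when the refcount is still non-zero,
so an rcu reader cannot resurrect an entry that is already being freed.

/* Sketch of the hold-if-still-alive pattern the description implies: */
static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
{
	return atomic_inc_not_zero(&f6i->fib6_ref);
}

/*
 * Callers under rcu_read_lock() would then do:
 *
 *	if (!fib6_info_hold_safe(f6i))
 *		f6i = NULL;	// entry is going away; do not use it
 */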
Acked-by: Martin KaFai Lau 


[PATCH v2 bpf 0/3] Introduce BPF_ANNOTATE_KV_PAIR

2018-07-21 Thread Martin KaFai Lau
The series allows the BPF loader to figure out
the btf_key_id and btf_value_id from a map's name
by using BPF_ANNOTATE_KV_PAIR.  It also removes
the old 'typedef' way which requires two separate
typedefs (one for the key and one for the value).

By doing this, iproute2 and libbpf have one
consistent way to figure out the btf_key_type_id and
btf_value_type_id for a map.

The first two patches are some prep/cleanup work.
The last patch introduces BPF_ANNOTATE_KV_PAIR.

v2:
- Fix the incorrect '&&' check on container_type
  in bpf_map_find_btf_info().
- Expose the existing static btf_type_by_id() instead of
  creating a new one.

Martin KaFai Lau (3):
  bpf: btf: Sync uapi btf.h to tools
  bpf: Replace [u]int32_t and [u]int64_t in libbpf
  bpf: Introduce BPF_ANNOTATE_KV_PAIR

 tools/include/uapi/linux/btf.h   |  2 +-
 tools/lib/bpf/btf.c  | 33 
 tools/lib/bpf/btf.h  | 10 ++-
 tools/lib/bpf/libbpf.c   | 85 +++-
 tools/lib/bpf/libbpf.h   |  4 +-
 tools/testing/selftests/bpf/bpf_helpers.h|  9 +++
 tools/testing/selftests/bpf/test_btf_haskv.c |  7 +-
 7 files changed, 80 insertions(+), 70 deletions(-)

-- 
2.17.1



[PATCH v2 bpf 2/3] bpf: Replace [u]int32_t and [u]int64_t in libbpf

2018-07-21 Thread Martin KaFai Lau
This patch replaces [u]int32_t and [u]int64_t usage with
__[su]32 and __[su]64.  The same change goes for [u]int16_t
and [u]int8_t.

Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf")
Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/btf.c| 28 +---
 tools/lib/bpf/btf.h|  8 
 tools/lib/bpf/libbpf.c | 12 ++--
 tools/lib/bpf/libbpf.h |  4 ++--
 4 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 8c54a4b6f187..ce77b5b57912 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2018 Facebook */
 
 #include <stdlib.h>
-#include <stdint.h>
 #include <string.h>
 #include <unistd.h>
 #include <errno.h>
@@ -27,13 +26,13 @@ struct btf {
struct btf_type **types;
const char *strings;
void *nohdr_data;
-   uint32_t nr_types;
-   uint32_t types_size;
-   uint32_t data_size;
+   __u32 nr_types;
+   __u32 types_size;
+   __u32 data_size;
int fd;
 };
 
-static const char *btf_name_by_offset(const struct btf *btf, uint32_t offset)
+static const char *btf_name_by_offset(const struct btf *btf, __u32 offset)
 {
if (offset < btf->hdr->str_len)
		return &btf->strings[offset];
@@ -151,7 +150,7 @@ static int btf_parse_type_sec(struct btf *btf, 
btf_print_fn_t err_log)
 
while (next_type < end_type) {
struct btf_type *t = next_type;
-   uint16_t vlen = BTF_INFO_VLEN(t->info);
+   __u16 vlen = BTF_INFO_VLEN(t->info);
int err;
 
next_type += sizeof(*t);
@@ -191,7 +190,7 @@ static int btf_parse_type_sec(struct btf *btf, 
btf_print_fn_t err_log)
 }
 
 static const struct btf_type *btf_type_by_id(const struct btf *btf,
-uint32_t type_id)
+__u32 type_id)
 {
if (type_id > btf->nr_types)
return NULL;
@@ -226,12 +225,12 @@ static int64_t btf_type_size(const struct btf_type *t)
 
 #define MAX_RESOLVE_DEPTH 32
 
-int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id)
+__s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 {
const struct btf_array *array;
const struct btf_type *t;
-   uint32_t nelems = 1;
-   int64_t size = -1;
+   __u32 nelems = 1;
+   __s64 size = -1;
int i;
 
t = btf_type_by_id(btf, type_id);
@@ -271,9 +270,9 @@ int64_t btf__resolve_size(const struct btf *btf, uint32_t 
type_id)
return nelems * size;
 }
 
-int32_t btf__find_by_name(const struct btf *btf, const char *type_name)
+__s32 btf__find_by_name(const struct btf *btf, const char *type_name)
 {
-   uint32_t i;
+   __u32 i;
 
if (!strcmp(type_name, "void"))
return 0;
@@ -302,10 +301,9 @@ void btf__free(struct btf *btf)
free(btf);
 }
 
-struct btf *btf__new(uint8_t *data, uint32_t size,
-btf_print_fn_t err_log)
+struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log)
 {
-   uint32_t log_buf_size = 0;
+   __u32 log_buf_size = 0;
char *log_buf = NULL;
struct btf *btf;
int err;
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 74bb344035bb..ed3a84370ccc 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -4,7 +4,7 @@
 #ifndef __BPF_BTF_H
 #define __BPF_BTF_H
 
-#include <stdint.h>
+#include <linux/types.h>
 
 #define BTF_ELF_SEC ".BTF"
 
@@ -14,9 +14,9 @@ typedef int (*btf_print_fn_t)(const char *, ...)
__attribute__((format(printf, 1, 2)));
 
 void btf__free(struct btf *btf);
-struct btf *btf__new(uint8_t *data, uint32_t size, btf_print_fn_t err_log);
-int32_t btf__find_by_name(const struct btf *btf, const char *type_name);
-int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id);
+struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
+__s32 btf__find_by_name(const struct btf *btf, const char *type_name);
+__s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
 int btf__fd(const struct btf *btf);
 
 #endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a1e96b5de5ff..6deb4fe4fffe 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -216,8 +216,8 @@ struct bpf_map {
size_t offset;
int map_ifindex;
struct bpf_map_def def;
-   uint32_t btf_key_type_id;
-   uint32_t btf_value_type_id;
+   __u32 btf_key_type_id;
+   __u32 btf_value_type_id;
void *priv;
bpf_map_clear_priv_t clear_priv;
 };
@@ -1016,8 +1016,8 @@ static int bpf_map_find_btf_info(struct bpf_map *map, 
const struct btf *btf)
 {
	struct bpf_map_def *def = &map->def;
const size_t max_name = 256;
-   int64_t key_size, value_size;
-   int32_t key_id, value_id;
+   __s64 key_size, value_size;
+   __s32 key_id, value_id;
char name[max_name];
 
/* Fin

[PATCH v2 bpf 3/3] bpf: Introduce BPF_ANNOTATE_KV_PAIR

2018-07-21 Thread Martin KaFai Lau
This patch introduces BPF_ANNOTATE_KV_PAIR to signal the
bpf loader about the btf key_type and value_type of a bpf map.
Please refer to the changes in test_btf_haskv.c for its usage.
Both iproute2 and libbpf loader will then have the same
convention to find out the map's btf_key_type_id and
btf_value_type_id from a map's name.

Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf")
Suggested-by: Daniel Borkmann 
Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/btf.c  |  7 +-
 tools/lib/bpf/btf.h  |  2 +
 tools/lib/bpf/libbpf.c   | 75 +++-
 tools/testing/selftests/bpf/bpf_helpers.h|  9 +++
 tools/testing/selftests/bpf/test_btf_haskv.c |  7 +-
 5 files changed, 56 insertions(+), 44 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index ce77b5b57912..321a99e648ed 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -189,8 +189,7 @@ static int btf_parse_type_sec(struct btf *btf, 
btf_print_fn_t err_log)
return 0;
 }
 
-static const struct btf_type *btf_type_by_id(const struct btf *btf,
-__u32 type_id)
+const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id)
 {
if (type_id > btf->nr_types)
return NULL;
@@ -233,7 +232,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 
type_id)
__s64 size = -1;
int i;
 
-   t = btf_type_by_id(btf, type_id);
+   t = btf__type_by_id(btf, type_id);
for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t);
 i++) {
size = btf_type_size(t);
@@ -258,7 +257,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 
type_id)
return -EINVAL;
}
 
-   t = btf_type_by_id(btf, type_id);
+   t = btf__type_by_id(btf, type_id);
}
 
if (size < 0)
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index ed3a84370ccc..e2a09a155f84 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -9,6 +9,7 @@
 #define BTF_ELF_SEC ".BTF"
 
 struct btf;
+struct btf_type;
 
 typedef int (*btf_print_fn_t)(const char *, ...)
__attribute__((format(printf, 1, 2)));
@@ -16,6 +17,7 @@ typedef int (*btf_print_fn_t)(const char *, ...)
 void btf__free(struct btf *btf);
 struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
 __s32 btf__find_by_name(const struct btf *btf, const char *type_name);
+const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id);
 __s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
 int btf__fd(const struct btf *btf);
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 6deb4fe4fffe..d881d370616c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1014,68 +1015,72 @@ bpf_program__collect_reloc(struct bpf_program *prog, 
GElf_Shdr *shdr,
 
 static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
 {
+   const struct btf_type *container_type;
+   const struct btf_member *key, *value;
	struct bpf_map_def *def = &map->def;
const size_t max_name = 256;
+   char container_name[max_name];
__s64 key_size, value_size;
-   __s32 key_id, value_id;
-   char name[max_name];
+   __s32 container_id;
 
-   /* Find key type by name from BTF */
-   if (snprintf(name, max_name, "%s_key", map->name) == max_name) {
-   pr_warning("map:%s length of BTF key_type:%s_key is too long\n",
+   if (snprintf(container_name, max_name, "btf_map_%s", map->name) ==
+   max_name) {
+   pr_warning("map:%s length of 'btf_map_%s' is too long\n",
   map->name, map->name);
return -EINVAL;
}
 
-   key_id = btf__find_by_name(btf, name);
-   if (key_id < 0) {
-   pr_debug("map:%s key_type:%s cannot be found in BTF\n",
-map->name, name);
-   return key_id;
+   container_id = btf__find_by_name(btf, container_name);
+   if (container_id < 0) {
+   pr_debug("map:%s container_name:%s cannot be found in BTF. 
Missing BPF_ANNOTATE_KV_PAIR?\n",
+map->name, container_name);
+   return container_id;
}
 
-   key_size = btf__resolve_size(btf, key_id);
-   if (key_size < 0) {
-   pr_warning("map:%s key_type:%s cannot get the BTF type_size\n",
-  map->name, name);
-   return key_size;
+   container_type = btf__type_by_id(btf, container_id);
+   if (!container_type) {
+   pr_warning("map:%s cannot fi

[PATCH v2 bpf 1/3] bpf: btf: Sync uapi btf.h to tools

2018-07-21 Thread Martin KaFai Lau
This patch syncs the uapi btf.h to tools/.

Fixes: 36fc3c8c282c ("bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h")
Signed-off-by: Martin KaFai Lau 
---
 tools/include/uapi/linux/btf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index 0b5ddbe135a4..972265f32871 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -76,7 +76,7 @@ struct btf_type {
  */
 #define BTF_INT_ENCODING(VAL)  (((VAL) & 0x0f000000) >> 24)
 #define BTF_INT_OFFSET(VAL)    (((VAL  & 0x00ff0000)) >> 16)
-#define BTF_INT_BITS(VAL)  ((VAL)  & 0x0000ffff)
+#define BTF_INT_BITS(VAL)  ((VAL)  & 0x000000ff)
 
 /* Attributes stored in the BTF_INT_ENCODING */
 #define BTF_INT_SIGNED (1 << 0)
-- 
2.17.1



Re: [PATCH bpf 3/3] bpf: Introduce BPF_ANNOTATE_KV_PAIR

2018-07-21 Thread Martin KaFai Lau
On Fri, Jul 20, 2018 at 06:39:33PM -0700, Martin KaFai Lau wrote:
> This patch introduces BPF_ANNOTATE_KV_PAIR to signal the
> bpf loader about the btf key_type and value_type of a bpf map.
> Please refer to the changes in test_btf_haskv.c for its usage.
> Both iproute2 and libbpf loader will then have the same
> convention to find out the map's btf_key_type_id and
> btf_value_type_id from a map's name.
> 
> Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf")
> Suggested-by: Daniel Borkmann 
> Signed-off-by: Martin KaFai Lau 
> ---
>  tools/lib/bpf/btf.c  |  8 +++
>  tools/lib/bpf/btf.h  |  1 +
>  tools/lib/bpf/libbpf.c   | 71 +++-
>  tools/testing/selftests/bpf/bpf_helpers.h|  9 +++
>  tools/testing/selftests/bpf/test_btf_haskv.c |  7 +-
>  5 files changed, 59 insertions(+), 37 deletions(-)
> 
> diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
> index ce77b5b57912..748f0b11361d 100644
> --- a/tools/lib/bpf/btf.c
> +++ b/tools/lib/bpf/btf.c
> @@ -288,6 +288,14 @@ __s32 btf__find_by_name(const struct btf *btf, const 
> char *type_name)
>   return -ENOENT;
>  }
>  
> +const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id)
> +{
> + if (!id || id > btf->nr_types)
> + return ERR_PTR(-EINVAL);
> +
> + return btf->types[id];
> +}
> +
>  void btf__free(struct btf *btf)
>  {
>   if (!btf)
> diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
> index ed3a84370ccc..38ebf66613e4 100644
> --- a/tools/lib/bpf/btf.h
> +++ b/tools/lib/bpf/btf.h
> @@ -16,6 +16,7 @@ typedef int (*btf_print_fn_t)(const char *, ...)
>  void btf__free(struct btf *btf);
>  struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
>  __s32 btf__find_by_name(const struct btf *btf, const char *type_name);
> +const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id);
>  __s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
>  int btf__fd(const struct btf *btf);
>  
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 6deb4fe4fffe..5ff7755efa6b 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -36,6 +36,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1014,63 +1015,69 @@ bpf_program__collect_reloc(struct bpf_program *prog, 
> GElf_Shdr *shdr,
>  
>  static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
>  {
> + const struct btf_type *container_type;
> + const struct btf_member *key, *value;
> + __s32 key_id, value_id, container_id;
> 	struct bpf_map_def *def = &map->def;
>   const size_t max_name = 256;
> + char container_name[max_name];
>   __s64 key_size, value_size;
> - __s32 key_id, value_id;
> - char name[max_name];
>  
> - /* Find key type by name from BTF */
> - if (snprintf(name, max_name, "%s_key", map->name) == max_name) {
> - pr_warning("map:%s length of BTF key_type:%s_key is too long\n",
> + if (snprintf(container_name, max_name, "btf_map_%s", map->name) ==
> + max_name) {
> + pr_warning("map:%s length of 'btf_map_%s' is too long\n",
>  map->name, map->name);
>   return -EINVAL;
>   }
>  
> - key_id = btf__find_by_name(btf, name);
> - if (key_id < 0) {
> - pr_debug("map:%s key_type:%s cannot be found in BTF\n",
> -  map->name, name);
> - return key_id;
> + container_id = btf__find_by_name(btf, container_name);
> + if (container_id < 0) {
> + pr_debug("map:%s container_name:%s cannot be found in BTF. 
> Missing BPF_ANNOTATE_KV_PAIR?\n",
> +  map->name, container_name);
> + return container_id;
>   }
>  
> - key_size = btf__resolve_size(btf, key_id);
> - if (key_size < 0) {
> - pr_warning("map:%s key_type:%s cannot get the BTF type_size\n",
> -map->name, name);
> - return key_size;
> + container_type = btf__type_by_id(btf, container_id);
> + if (IS_ERR(container_type)) {
> + pr_warning("map:%s cannot find BTF type for container_id:%u\n",
> +map->name, container_id);
> + return PTR_ERR(container_type);
>   }
>  
> - if (def->key_size != key_size) {
> - pr_warning("map:%s key_type:%

[PATCH bpf 3/3] bpf: Introduce BPF_ANNOTATE_KV_PAIR

2018-07-20 Thread Martin KaFai Lau
This patch introduces BPF_ANNOTATE_KV_PAIR to signal the
bpf loader about the btf key_type and value_type of a bpf map.
Please refer to the changes in test_btf_haskv.c for its usage.
Both iproute2 and libbpf loader will then have the same
convention to find out the map's btf_key_type_id and
btf_value_type_id from a map's name.

Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf")
Suggested-by: Daniel Borkmann 
Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/btf.c  |  8 +++
 tools/lib/bpf/btf.h  |  1 +
 tools/lib/bpf/libbpf.c   | 71 +++-
 tools/testing/selftests/bpf/bpf_helpers.h|  9 +++
 tools/testing/selftests/bpf/test_btf_haskv.c |  7 +-
 5 files changed, 59 insertions(+), 37 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index ce77b5b57912..748f0b11361d 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -288,6 +288,14 @@ __s32 btf__find_by_name(const struct btf *btf, const char 
*type_name)
return -ENOENT;
 }
 
+const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id)
+{
+   if (!id || id > btf->nr_types)
+   return ERR_PTR(-EINVAL);
+
+   return btf->types[id];
+}
+
 void btf__free(struct btf *btf)
 {
if (!btf)
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index ed3a84370ccc..38ebf66613e4 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -16,6 +16,7 @@ typedef int (*btf_print_fn_t)(const char *, ...)
 void btf__free(struct btf *btf);
 struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
 __s32 btf__find_by_name(const struct btf *btf, const char *type_name);
+const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id);
 __s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
 int btf__fd(const struct btf *btf);
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 6deb4fe4fffe..5ff7755efa6b 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1014,63 +1015,69 @@ bpf_program__collect_reloc(struct bpf_program *prog, 
GElf_Shdr *shdr,
 
 static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
 {
+   const struct btf_type *container_type;
+   const struct btf_member *key, *value;
+   __s32 key_id, value_id, container_id;
	struct bpf_map_def *def = &map->def;
const size_t max_name = 256;
+   char container_name[max_name];
__s64 key_size, value_size;
-   __s32 key_id, value_id;
-   char name[max_name];
 
-   /* Find key type by name from BTF */
-   if (snprintf(name, max_name, "%s_key", map->name) == max_name) {
-   pr_warning("map:%s length of BTF key_type:%s_key is too long\n",
+   if (snprintf(container_name, max_name, "btf_map_%s", map->name) ==
+   max_name) {
+   pr_warning("map:%s length of 'btf_map_%s' is too long\n",
   map->name, map->name);
return -EINVAL;
}
 
-   key_id = btf__find_by_name(btf, name);
-   if (key_id < 0) {
-   pr_debug("map:%s key_type:%s cannot be found in BTF\n",
-map->name, name);
-   return key_id;
+   container_id = btf__find_by_name(btf, container_name);
+   if (container_id < 0) {
+   pr_debug("map:%s container_name:%s cannot be found in BTF. 
Missing BPF_ANNOTATE_KV_PAIR?\n",
+map->name, container_name);
+   return container_id;
}
 
-   key_size = btf__resolve_size(btf, key_id);
-   if (key_size < 0) {
-   pr_warning("map:%s key_type:%s cannot get the BTF type_size\n",
-  map->name, name);
-   return key_size;
+   container_type = btf__type_by_id(btf, container_id);
+   if (IS_ERR(container_type)) {
+   pr_warning("map:%s cannot find BTF type for container_id:%u\n",
+  map->name, container_id);
+   return PTR_ERR(container_type);
}
 
-   if (def->key_size != key_size) {
-   pr_warning("map:%s key_type:%s has BTF type_size:%u != 
key_size:%u\n",
-  map->name, name, (unsigned int)key_size, 
def->key_size);
+   if (BTF_INFO_KIND(container_type->info) != BTF_KIND_STRUCT &&
+   BTF_INFO_VLEN(container_type->info) < 2) {
+   pr_warning("map:%s container_name:%s is an invalid container 
struct\n",
+  map->name, container_name);
return -EINVAL;
}
 
-   /* Find value type from BTF */
-   if (snprintf(name, max_name, "%s_va

[PATCH bpf 2/3] bpf: Replace [u]int32_t and [u]int64_t in libbpf

2018-07-20 Thread Martin KaFai Lau
This patch replaces [u]int32_t and [u]int64_t usage with
__[su]32 and __[su]64.  The same change goes for [u]int16_t
and [u]int8_t.

Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf")
Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/btf.c| 28 +---
 tools/lib/bpf/btf.h|  8 
 tools/lib/bpf/libbpf.c | 12 ++--
 tools/lib/bpf/libbpf.h |  4 ++--
 4 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 8c54a4b6f187..ce77b5b57912 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -2,7 +2,6 @@
 /* Copyright (c) 2018 Facebook */
 
 #include <stdlib.h>
-#include <stdint.h>
 #include <string.h>
 #include <unistd.h>
 #include <errno.h>
@@ -27,13 +26,13 @@ struct btf {
struct btf_type **types;
const char *strings;
void *nohdr_data;
-   uint32_t nr_types;
-   uint32_t types_size;
-   uint32_t data_size;
+   __u32 nr_types;
+   __u32 types_size;
+   __u32 data_size;
int fd;
 };
 
-static const char *btf_name_by_offset(const struct btf *btf, uint32_t offset)
+static const char *btf_name_by_offset(const struct btf *btf, __u32 offset)
 {
if (offset < btf->hdr->str_len)
		return &btf->strings[offset];
@@ -151,7 +150,7 @@ static int btf_parse_type_sec(struct btf *btf, 
btf_print_fn_t err_log)
 
while (next_type < end_type) {
struct btf_type *t = next_type;
-   uint16_t vlen = BTF_INFO_VLEN(t->info);
+   __u16 vlen = BTF_INFO_VLEN(t->info);
int err;
 
next_type += sizeof(*t);
@@ -191,7 +190,7 @@ static int btf_parse_type_sec(struct btf *btf, 
btf_print_fn_t err_log)
 }
 
 static const struct btf_type *btf_type_by_id(const struct btf *btf,
-uint32_t type_id)
+__u32 type_id)
 {
if (type_id > btf->nr_types)
return NULL;
@@ -226,12 +225,12 @@ static int64_t btf_type_size(const struct btf_type *t)
 
 #define MAX_RESOLVE_DEPTH 32
 
-int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id)
+__s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 {
const struct btf_array *array;
const struct btf_type *t;
-   uint32_t nelems = 1;
-   int64_t size = -1;
+   __u32 nelems = 1;
+   __s64 size = -1;
int i;
 
t = btf_type_by_id(btf, type_id);
@@ -271,9 +270,9 @@ int64_t btf__resolve_size(const struct btf *btf, uint32_t 
type_id)
return nelems * size;
 }
 
-int32_t btf__find_by_name(const struct btf *btf, const char *type_name)
+__s32 btf__find_by_name(const struct btf *btf, const char *type_name)
 {
-   uint32_t i;
+   __u32 i;
 
if (!strcmp(type_name, "void"))
return 0;
@@ -302,10 +301,9 @@ void btf__free(struct btf *btf)
free(btf);
 }
 
-struct btf *btf__new(uint8_t *data, uint32_t size,
-btf_print_fn_t err_log)
+struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log)
 {
-   uint32_t log_buf_size = 0;
+   __u32 log_buf_size = 0;
char *log_buf = NULL;
struct btf *btf;
int err;
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 74bb344035bb..ed3a84370ccc 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -4,7 +4,7 @@
 #ifndef __BPF_BTF_H
 #define __BPF_BTF_H
 
-#include <stdint.h>
+#include <linux/types.h>
 
 #define BTF_ELF_SEC ".BTF"
 
@@ -14,9 +14,9 @@ typedef int (*btf_print_fn_t)(const char *, ...)
__attribute__((format(printf, 1, 2)));
 
 void btf__free(struct btf *btf);
-struct btf *btf__new(uint8_t *data, uint32_t size, btf_print_fn_t err_log);
-int32_t btf__find_by_name(const struct btf *btf, const char *type_name);
-int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id);
+struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
+__s32 btf__find_by_name(const struct btf *btf, const char *type_name);
+__s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
 int btf__fd(const struct btf *btf);
 
 #endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a1e96b5de5ff..6deb4fe4fffe 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -216,8 +216,8 @@ struct bpf_map {
size_t offset;
int map_ifindex;
struct bpf_map_def def;
-   uint32_t btf_key_type_id;
-   uint32_t btf_value_type_id;
+   __u32 btf_key_type_id;
+   __u32 btf_value_type_id;
void *priv;
bpf_map_clear_priv_t clear_priv;
 };
@@ -1016,8 +1016,8 @@ static int bpf_map_find_btf_info(struct bpf_map *map, 
const struct btf *btf)
 {
	struct bpf_map_def *def = &map->def;
const size_t max_name = 256;
-   int64_t key_size, value_size;
-   int32_t key_id, value_id;
+   __s64 key_size, value_size;
+   __s32 key_id, value_id;
char name[max_name];
 
/* Fin

[PATCH bpf 1/3] bpf: btf: Sync uapi btf.h to tools

2018-07-20 Thread Martin KaFai Lau
This patch syncs the uapi btf.h to tools/.

Fixes: 36fc3c8c282c ("bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h")
Signed-off-by: Martin KaFai Lau 
---
 tools/include/uapi/linux/btf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index 0b5ddbe135a4..972265f32871 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -76,7 +76,7 @@ struct btf_type {
  */
 #define BTF_INT_ENCODING(VAL)  (((VAL) & 0x0f000000) >> 24)
 #define BTF_INT_OFFSET(VAL)    (((VAL  & 0x00ff0000)) >> 16)
-#define BTF_INT_BITS(VAL)  ((VAL)  & 0x0000ffff)
+#define BTF_INT_BITS(VAL)  ((VAL)  & 0x000000ff)
 
 /* Attributes stored in the BTF_INT_ENCODING */
 #define BTF_INT_SIGNED (1 << 0)
-- 
2.17.1



[PATCH bpf 0/3] Introduce BPF_ANNOTATE_KV_PAIR

2018-07-20 Thread Martin KaFai Lau
The series allows the BPF loader to figure out
the btf_key_id and btf_value_id from a map's name
by using BPF_ANNOTATE_KV_PAIR.  It also removes
the old 'typedef' way which requires two separate
typedefs (one for the key and one for the value).

By doing this, iproute2 and libbpf have one
consistent way to figure out the btf_key_type_id and
btf_value_type_id for a map.

The first two patches are some prep/cleanup work.
The last patch introduces BPF_ANNOTATE_KV_PAIR.

Martin KaFai Lau (3):
  bpf: btf: Sync uapi btf.h to tools
  bpf: Replace [u]int32_t and [u]int64_t in libbpf
  bpf: Introduce BPF_ANNOTATE_KV_PAIR

 tools/include/uapi/linux/btf.h   |  2 +-
 tools/lib/bpf/btf.c  | 36 +
 tools/lib/bpf/btf.h  |  9 ++-
 tools/lib/bpf/libbpf.c   | 81 +++-
 tools/lib/bpf/libbpf.h   |  4 +-
 tools/testing/selftests/bpf/bpf_helpers.h|  9 +++
 tools/testing/selftests/bpf/test_btf_haskv.c |  7 +-
 7 files changed, 84 insertions(+), 64 deletions(-)

-- 
2.17.1



[PATCH bpf] bpf: btf: Ensure the member->offset is in the right order

2018-07-20 Thread Martin KaFai Lau
This patch ensures that the member->offset values of a struct
are in the correct order (i.e. a later member's offset cannot
go backward).

The current "pahole -J" BTF encoder does not generate such
out-of-order offsets.  However, checking it ensures future
encoders will not violate the ordering.
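The invariant, pulled out as a standalone sketch (a hypothetical helper;
the in-kernel check lives inside btf_struct_check_meta() as the diff
below shows):

#include <stdbool.h>

/* ">" rather than ">=": a trailing zero-sized member such as
 * "char a[0];" may legally share the previous member's offset.
 */
static bool member_offsets_ordered(const unsigned int *offsets,
				   unsigned short vlen)
{
	unsigned int last_offset = 0;
	unsigned short i;

	for (i = 0; i < vlen; i++) {
		if (last_offset > offsets[i])
			return false;
		last_offset = offsets[i];
	}
	return true;
}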

Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)")
Signed-off-by: Martin KaFai Lau 
---
 kernel/bpf/btf.c   | 14 -
 tools/testing/selftests/bpf/test_btf.c | 28 ++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9704934252b3..2590700237c1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1519,9 +1519,9 @@ static s32 btf_struct_check_meta(struct btf_verifier_env 
*env,
 {
bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION;
const struct btf_member *member;
+   u32 meta_needed, last_offset;
struct btf *btf = env->btf;
u32 struct_size = t->size;
-   u32 meta_needed;
u16 i;
 
meta_needed = btf_type_vlen(t) * sizeof(*member);
@@ -1534,6 +1534,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env 
*env,
 
btf_verifier_log_type(env, t, NULL);
 
+   last_offset = 0;
for_each_member(i, t, member) {
if (!btf_name_offset_valid(btf, member->name_off)) {
btf_verifier_log_member(env, t, member,
@@ -1555,6 +1556,16 @@ static s32 btf_struct_check_meta(struct btf_verifier_env 
*env,
return -EINVAL;
}
 
+   /*
+* ">" instead of ">=" because the last member could be
+* "char a[0];"
+*/
+   if (last_offset > member->offset) {
+   btf_verifier_log_member(env, t, member,
+   "Invalid member bits_offset");
+   return -EINVAL;
+   }
+
if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
btf_verifier_log_member(env, t, member,
"Memmber bits_offset exceeds 
its struct size");
@@ -1562,6 +1573,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env 
*env,
}
 
btf_verifier_log_member(env, t, member, NULL);
+   last_offset = member->offset;
}
 
return meta_needed;
diff --git a/tools/testing/selftests/bpf/test_btf.c 
b/tools/testing/selftests/bpf/test_btf.c
index 3619f3023088..402c0f7cc418 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -247,6 +247,34 @@ static struct btf_raw_test raw_tests[] = {
.max_entries = 4,
 },
 
+{
+   .descr = "struct test #3 Invalid member offset",
+   .raw_types = {
+   /* int */   /* [1] */
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+   /* int64 */ /* [2] */
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 64, 8),
+
+   /* struct A { *//* [3] */
+   BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 16),
+   BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int m;   
*/
+   BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* int64 n; */
+   /* } */
+   BTF_END_RAW,
+   },
+   .str_sec = "\0A\0m\0n\0",
+   .str_sec_size = sizeof("\0A\0m\0n\0"),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "struct_test3_map",
+   .key_size = sizeof(int),
+   .value_size = 16,
+   .key_type_id = 1,
+   .value_type_id = 3,
+   .max_entries = 4,
+   .btf_load_err = true,
+   .err_str = "Invalid member bits_offset",
+},
+
 /* Test member exceeds the size of struct.
  *
  * struct A {
-- 
2.17.1



Re: [PATCH bpf] xdp: add NULL pointer check in __xdp_return()

2018-07-20 Thread Martin KaFai Lau
On Sat, Jul 21, 2018 at 01:04:45AM +0900, Taehee Yoo wrote:
> rhashtable_lookup() can return NULL, so a NULL pointer
> check should be added.
> 
> Fixes: 02b55e5657c3 ("xdp: add MEM_TYPE_ZERO_COPY")
> Signed-off-by: Taehee Yoo 
> ---
>  net/core/xdp.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 9d1f220..1c12bc7 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -345,7 +345,8 @@ static void __xdp_return(void *data, struct xdp_mem_info 
> *mem, bool napi_direct,
>   rcu_read_lock();
>   /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
>   xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
> - xa->zc_alloc->free(xa->zc_alloc, handle);
> + if (xa)
> + xa->zc_alloc->free(xa->zc_alloc, handle);
Hmm... it is not clear to me that the "!xa" case doesn't have to be handled?

>   rcu_read_unlock();
>   default:
>   /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
> -- 
> 2.9.3
> 
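For illustration, one way the "!xa" case could be surfaced rather than
silently skipped (a sketch, not the merged fix):

	xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
	if (!xa) {
		WARN_ON_ONCE(1);	/* zero-copy state disappeared */
		rcu_read_unlock();
		return;
	}
	xa->zc_alloc->free(xa->zc_alloc, handle);
	rcu_read_unlock();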


[PATCH bpf] bpf: Use option "help" in the llvm-objcopy test

2018-07-19 Thread Martin KaFai Lau
I noticed the "--version" option of the llvm-objcopy command has recently
disappeared from the master llvm branch.  It is currently used as a BTF
support test in tools/testing/selftests/bpf/Makefile.

This patch replaces it with "--help", which should be
less error-prone in the future.

Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests")
Signed-off-by: Martin KaFai Lau 
---
 tools/testing/selftests/bpf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 7a6214e9ae58..a362e3d7abc6 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -105,7 +105,7 @@ $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline
 
 BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
 BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
-BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --version 2>&1 | grep LLVM)
+BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 
'usage.*llvm')
 
 ifneq ($(BTF_LLC_PROBE),)
 ifneq ($(BTF_PAHOLE_PROBE),)
-- 
2.17.1



[PATCH bpf] bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h

2018-07-19 Thread Martin KaFai Lau
This patch shrinks the BTF_INT_BITS() mask.  The current
btf_int_check_meta() ensures the nr_bits of an integer
cannot exceed 64.  Hence, it is mostly a uapi cleanup.

The actual btf usage (i.e. seq_show()) is also modified
to use u8 instead of u16.  The verification (e.g. btf_int_check_meta())
path stays as is to deal with invalid BTF situations.
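Concretely, the int kind's auxiliary word packs encoding, bit offset, and
nr_bits.  A tiny userspace decode of an illustrative value (the macros
below are written out plainly, functionally equivalent to the post-patch
uapi definitions):

#include <stdio.h>

#define BTF_INT_ENCODING(VAL)	(((VAL) & 0x0f000000) >> 24)
#define BTF_INT_OFFSET(VAL)	(((VAL) & 0x00ff0000) >> 16)
#define BTF_INT_BITS(VAL)	((VAL) & 0x000000ff)	/* shrunken mask */

int main(void)
{
	/* Illustrative value: a signed int, bit offset 0, 32 bits wide. */
	unsigned int int_data = (1u << 24) | 32;

	printf("encoding=%u offset=%u nr_bits=%u\n",
	       BTF_INT_ENCODING(int_data),
	       BTF_INT_OFFSET(int_data),
	       BTF_INT_BITS(int_data));	/* nr_bits now fits in a u8 */
	return 0;
}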

Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)")
Signed-off-by: Martin KaFai Lau 
---
 include/uapi/linux/btf.h |  2 +-
 kernel/bpf/btf.c | 16 ++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 0b5ddbe135a4..972265f32871 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -76,7 +76,7 @@ struct btf_type {
  */
 #define BTF_INT_ENCODING(VAL)  (((VAL) & 0x0f000000) >> 24)
 #define BTF_INT_OFFSET(VAL)    (((VAL  & 0x00ff0000)) >> 16)
-#define BTF_INT_BITS(VAL)  ((VAL)  & 0x0000ffff)
+#define BTF_INT_BITS(VAL)  ((VAL)  & 0x000000ff)
 
 /* Attributes stored in the BTF_INT_ENCODING */
 #define BTF_INT_SIGNED (1 << 0)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e016ac3afa24..9704934252b3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -450,7 +450,7 @@ static const struct btf_type *btf_type_by_id(const struct 
btf *btf, u32 type_id)
  */
 static bool btf_type_int_is_regular(const struct btf_type *t)
 {
-   u16 nr_bits, nr_bytes;
+   u8 nr_bits, nr_bytes;
u32 int_data;
 
int_data = btf_type_int(t);
@@ -993,12 +993,16 @@ static void btf_int_bits_seq_show(const struct btf *btf,
 {
u16 left_shift_bits, right_shift_bits;
u32 int_data = btf_type_int(t);
-   u16 nr_bits = BTF_INT_BITS(int_data);
-   u16 total_bits_offset;
-   u16 nr_copy_bytes;
-   u16 nr_copy_bits;
+   u8 nr_bits = BTF_INT_BITS(int_data);
+   u8 total_bits_offset;
+   u8 nr_copy_bytes;
+   u8 nr_copy_bits;
u64 print_num;
 
+   /*
+* bits_offset is at most 7.
+* BTF_INT_OFFSET() cannot exceed 64 bits.
+*/
total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
@@ -1028,7 +1032,7 @@ static void btf_int_seq_show(const struct btf *btf, const 
struct btf_type *t,
u32 int_data = btf_type_int(t);
u8 encoding = BTF_INT_ENCODING(int_data);
bool sign = encoding & BTF_INT_SIGNED;
-   u32 nr_bits = BTF_INT_BITS(int_data);
+   u8 nr_bits = BTF_INT_BITS(int_data);
 
if (bits_offset || BTF_INT_OFFSET(int_data) ||
BITS_PER_BYTE_MASKED(nr_bits)) {
-- 
2.17.1



Re: [PATCH iproute2 5/5] bpf: implement btf handling and map annotation

2018-07-19 Thread Martin KaFai Lau
On Thu, Jul 19, 2018 at 05:43:11PM +0200, Daniel Borkmann wrote:
> On 07/19/2018 02:11 AM, Martin KaFai Lau wrote:
> > On Wed, Jul 18, 2018 at 11:13:37AM -0700, Jakub Kicinski wrote:
> >> On Wed, 18 Jul 2018 11:33:22 +0200, Daniel Borkmann wrote:
> >>> On 07/18/2018 10:42 AM, Daniel Borkmann wrote:
> >>>> On 07/18/2018 02:27 AM, Jakub Kicinski wrote:  
> >>>>> On Wed, 18 Jul 2018 01:31:22 +0200, Daniel Borkmann wrote:  
> >>>>>>   # bpftool map dump id 386
> >>>>>>[{
> >>>>>> "key": 0,
> >>>>>> "value": {
> >>>>>> "": {
> >>>>>> "value": 0,
> >>>>>> "ifindex": 0,
> >>>>>> "mac": []
> >>>>>> }
> >>>>>> }
> >>>>>> },{
> >>>>>> "key": 1,
> >>>>>> "value": {
> >>>>>> "": {
> >>>>>> "value": 0,
> >>>>>> "ifindex": 0,
> >>>>>> "mac": []
> >>>>>> }
> >>>>>> }
> >>>>>> },{
> >>>>>>   [...]  
> >>>>>
> >>>>> Ugh, the empty keys ("") look worrying, we should probably improve
> >>>>> handling of anonymous structs in bpftool :S  
> >>>>
> >>>> Yeah agree, I think it would be nice to see a more pahole style dump
> >>>> where we have types and member names along with the value as otherwise
> >>>> it might be a bit confusing.  
> >>>
> >>> Another feature that would be super useful imho would be in the /single/
> >>> map view e.g. 'bpftool map show id 123' to have a detailed BTF key+value
> >>> type dump, so in addition to the basic map info we show pahole like info
> >>> of the structs with length/offsets.
> >>
> >> That sounds good!  We could also consider adding a btf object and
> >> commands to interrogate BTF types in the kernel in general..  Perhaps
> >> then we could add something like bpftool btf describe map id 123.
> > +1 on the btf subcommand.
> 
> That would also work, I think both might be useful to have. Former would
> all sit under a single command to show map details.
Agree that both would be useful.  The btf command could address the whole BTF
object, which could include many maps/types, while the map command
focuses on its own map info.

> With 'bpftool btf' you
> would also allow for a full BTF dump when a specific BTF obj id is provided?
Right, I think the BTF obj id (or file) is needed for the btf command.  and then
it should allow to do full dump or only show a particular map/type id.

A little forward thinking: a map here is a C type.  Hence, I think using the
name "type", as in "bpftool btf id 1 show _type_ id 123", may be better when
we later expand BTF usage beyond BPF programs.
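
For illustration, the hypothetical command shapes floated in this thread
would look roughly like the following (none of these exist in bpftool at
this point; names and syntax are illustrative only):

  # bpftool btf id 1 show                  (dump the whole BTF object)
  # bpftool btf id 1 show type id 123      (show a single type)
  # bpftool btf describe map id 123        (resolve a map's key/value types)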

> 
> >> Having the single map view show more information seems interesting, but
> >> I wonder if it could be surprising.  Is there precedent for such
> >> behaviour?
> > Having everything in one page (map show id 123) could be interesting.
> > One concern is that the pahole-like output may be quite long;
> > e.g. the member of a struct could itself be another struct.
> 
> Right, though probably fine when you want to see all information specific
> to one map. Of course the 'bpftool map' list view would need to hide this
> information.
> 
> > Not sure how the pahole-like output would look in json though.
> 
> Would the existing map k/v dump have more or less the same 'issue'?
True, the existing map k/v dump of the map data reuses the
json {}/[]/""/number convention.  I think that is ok and actually a
pretty condensed format, since people are used to this convention when
reading "data" output.

For printing out a C type, I think it is more natural to keep it as
close to C syntax as possible so that it can be parsed by human
eyes.  However, yes, we could reuse a similar fashion to print types
in json as we do when printing data.  Just curious: is the json type
output more for scripts, or mostly for people who can read everything
from one json output?

For plaintext, we can just print like pahole.
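
As a rough sketch of that plaintext direction, a pahole-like dump of the
anonymous struct from the map output above might look like this (member
types, offsets and sizes are assumptions for illustration, not read from
the actual map's BTF):

  struct {
          unsigned int  value;     /* offset  0, size 4 */
          unsigned int  ifindex;   /* offset  4, size 4 */
          unsigned char mac[6];    /* offset  8, size 6 */

          /* size: 16, padding: 2 */
  };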


Re: [PATCH iproute2 5/5] bpf: implement btf handling and map annotation

2018-07-18 Thread Martin KaFai Lau
On Wed, Jul 18, 2018 at 11:13:37AM -0700, Jakub Kicinski wrote:
> On Wed, 18 Jul 2018 11:33:22 +0200, Daniel Borkmann wrote:
> > On 07/18/2018 10:42 AM, Daniel Borkmann wrote:
> > > On 07/18/2018 02:27 AM, Jakub Kicinski wrote:  
> > >> On Wed, 18 Jul 2018 01:31:22 +0200, Daniel Borkmann wrote:  
> > >>>   # bpftool map dump id 386
> > >>>[{
> > >>> "key": 0,
> > >>> "value": {
> > >>> "": {
> > >>> "value": 0,
> > >>> "ifindex": 0,
> > >>> "mac": []
> > >>> }
> > >>> }
> > >>> },{
> > >>> "key": 1,
> > >>> "value": {
> > >>> "": {
> > >>> "value": 0,
> > >>> "ifindex": 0,
> > >>> "mac": []
> > >>> }
> > >>> }
> > >>> },{
> > >>>   [...]  
> > >>
> > >> Ugh, the empty keys ("") look worrying, we should probably improve
> > >> handling of anonymous structs in bpftool :S  
> > > 
> > > Yeah agree, I think it would be nice to see a more pahole style dump
> > > where we have types and member names along with the value as otherwise
> > > it might be a bit confusing.  
> > 
> > Another feature that would be super useful imho would be in the /single/
> > map view e.g. 'bpftool map show id 123' to have a detailed BTF key+value
> > type dump, so in addition to the basic map info we show pahole like info
> > of the structs with length/offsets.
> 
> That sounds good!  We could also consider adding a btf object and
> commands to interrogate BTF types in the kernel in general..  Perhaps
> then we could add something like bpftool btf describe map id 123.
+1 on the btf subcommand.

> 
> Having the single map view show more information seems interesting, but
> I wonder if it could be surprising.  Is there precedent for such
> behaviour?
Having everything in one page (map show id 123) could be interesting.
One concern is that the pahole-like output may be quite long;
e.g. the member of a struct could itself be another struct.

Not sure how the pahole-like output would look in json though.


Re: [PATCH iproute2 5/5] bpf: implement btf handling and map annotation

2018-07-18 Thread Martin KaFai Lau
On Tue, Jul 17, 2018 at 05:27:43PM -0700, Jakub Kicinski wrote:
> On Wed, 18 Jul 2018 01:31:22 +0200, Daniel Borkmann wrote:
> >   # bpftool map dump id 386
> >[{
> > "key": 0,
> > "value": {
> > "": {
> > "value": 0,
> > "ifindex": 0,
> > "mac": []
> > }
> > }
> > },{
> > "key": 1,
> > "value": {
> > "": {
> > "value": 0,
> > "ifindex": 0,
> > "mac": []
> > }
> > }
> > },{
> >   [...]
> 
> Ugh, the empty keys ("") look worrying, we should probably improve
> handling of anonymous structs in bpftool :S
Note that the kernel's btf_verifier_log is using "(anon)" in this case.
Not sure if it is a good idea for json.


Re: [PATCH bpf-next] bpf: show in bpftool map overview whether btf is available

2018-07-18 Thread Martin KaFai Lau
On Wed, Jul 18, 2018 at 11:19:42AM +0200, Daniel Borkmann wrote:
> For a quick overview in 'bpftool map' display 'btf' if it's
> available for the dump for a specific map:
> 
>   # bpftool map list
>   11: array  flags 0x0  btf
>   key 4B  value 20B  max_entries 40  memlock 4096B
> 
>   # bpftool --json --pretty map list
>   [{
>   "id": 11,
>   "type": "array",
>   "flags": 0,
>   "btf_available": true,
>   "bytes_key": 4,
>   "bytes_value": 20,
>   "max_entries": 40,
>   "bytes_memlock": 4096
>   }
>   ]
> 
> Signed-off-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 


Re: [bpf PATCH v4 3/4] bpf: sockhash fix omitted bucket lock in sock_close

2018-06-25 Thread Martin KaFai Lau
On Mon, Jun 25, 2018 at 08:34:17AM -0700, John Fastabend wrote:
> First in tcp_close, reduce scope of sk_callback_lock() the lock is
> only needed for protecting maps list the ingress and cork
> lists are protected by sock lock. Having the lock in wider scope is
> harmless but may confuse the reader who may infer it is in fact
> needed.
> 
> Next, in sock_hash_delete_elem() the pattern is as follows,
> 
>   sock_hash_delete_elem()
>  [...]
>  spin_lock(bucket_lock)
>  l = lookup_elem_raw()
>  if (l)
> hlist_del_rcu()
> write_lock(sk_callback_lock)
>   destroy psock ...
> write_unlock(sk_callback_lock)
>  spin_unlock(bucket_lock)
> 
> The ordering is necessary because we only know the {p}sock after
> dereferencing the hash table which we can't do unless we have the
> bucket lock held. Once we have the bucket lock and the psock element
> it is deleted from the hashmap to ensure any other path doing a lookup
> will fail. Finally, the refcnt is decremented and if zero the psock
> is destroyed.
> 
> In parallel with the above (or free'ing the map) a tcp close event
> may trigger tcp_close(). Which at the moment omits the bucket lock
> altogether (oops!) where the flow looks like this,
> 
>   bpf_tcp_close()
>  [...]
>  write_lock(sk_callback_lock)
>  for each psock->maps // list of maps this sock is part of
>  hlist_del_rcu(ref_hash_node);
>   destroy psock ...
>  write_unlock(sk_callback_lock)
> 
> Obviously, and demonstrated by syzbot, this is broken because
> we can have multiple threads deleting entries via hlist_del_rcu().
> 
> To fix this we might be tempted to wrap the hlist operation in a
> bucket lock but that would create a lock inversion problem. In
> summary to follow locking rules the psocks maps list needs the
> sk_callback_lock but we need the bucket lock to do the hlist_del_rcu.
> To resolve the lock inversion problem pop the head of the maps list
> repeatedly and remove the reference until no more are left. If a
> delete happens in parallel from the BPF API that is OK as well because
> it will do a similar action, lookup the lock in the map/hash, delete
> it from the map/hash, and dec the refcnt. We check for this case
> before doing a destroy on the psock to ensure we don't have two
> threads tearing down a psock. The new logic is as follows,
> 
>   bpf_tcp_close()
>   e = psock_map_pop(psock->maps) // done with sk_callback_lock
>   bucket_lock() // lock hash list bucket
>   l = lookup_elem_raw(head, hash, key, key_size);
>   if (l) {
>  //only get here if elmnt was not already removed
>  hlist_del_rcu()
>  ... destroy psock...
>   }
>   bucket_unlock()
> 
> And finally for all the above to work add missing sk_callback_lock
> around smap_list_remove in sock_hash_ctx_update_elem(). Otherwise
> delete and update may corrupt maps list. Then add RCU annotations and
> use rcu_dereference/rcu_assign_pointer to manage values relying on
> RCU so that the object is not free'd from sock_hash_free() while it
> is being referenced in bpf_tcp_close().
> 
> (As an aside the sk_callback_lock serves two purposes. The
>  first, is to update the sock callbacks sk_data_ready, sk_write_space,
>  etc. The second is to protect the psock 'maps' list. The 'maps' list
>  is used to (as shown above) to delete all map/hash references to a
>  sock when the sock is closed)
> 
> Reported-by: syzbot+0ce137753c78f7b6a...@syzkaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend 
Acked-by: Martin KaFai Lau 
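
For readers following the locking discussion, a minimal sketch of the
pop-head-under-lock step described above could look like this (it reuses
the smap_psock/smap_psock_map_entry definitions from kernel/bpf/sockmap.c;
the exact helper in the merged patch may differ):

  static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
                                                    struct smap_psock *psock)
  {
          struct smap_psock_map_entry *e;

          /* sk_callback_lock protects the psock->maps list */
          write_lock_bh(&sk->sk_callback_lock);
          e = list_first_entry_or_null(&psock->maps,
                                       struct smap_psock_map_entry, list);
          if (e)
                  list_del(&e->list);
          write_unlock_bh(&sk->sk_callback_lock);
          return e;
  }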


Re: [bpf PATCH v3 3/4] bpf: sockhash fix omitted bucket lock in sock_close

2018-06-23 Thread Martin KaFai Lau
On Fri, Jun 22, 2018 at 08:21:44AM -0700, John Fastabend wrote:
> First in tcp_close, reduce scope of sk_callback_lock() the lock is
> only needed for protecting maps list the ingress and cork
> lists are protected by sock lock. Having the lock in wider scope is
> harmless but may confuse the reader who may infer it is in fact
> needed.
> 
> Next, in sock_hash_delete_elem() the pattern is as follows,
> 
>   sock_hash_delete_elem()
>  [...]
>  spin_lock(bucket_lock)
>  l = lookup_elem_raw()
>  if (l)
> hlist_del_rcu()
> write_lock(sk_callback_lock)
>   destroy psock ...
> write_unlock(sk_callback_lock)
>  spin_unlock(bucket_lock)
> 
> The ordering is necessary because we only know the {p}sock after
> dereferencing the hash table which we can't do unless we have the
> bucket lock held. Once we have the bucket lock and the psock element
> it is deleted from the hashmap to ensure any other path doing a lookup
> will fail. Finally, the refcnt is decremented and if zero the psock
> is destroyed.
> 
> In parallel with the above (or free'ing the map) a tcp close event
> may trigger tcp_close(). Which at the moment omits the bucket lock
> altogether (oops!) where the flow looks like this,
> 
>   bpf_tcp_close()
>  [...]
>  write_lock(sk_callback_lock)
>  for each psock->maps // list of maps this sock is part of
>  hlist_del_rcu(ref_hash_node);
>   destroy psock ...
>  write_unlock(sk_callback_lock)
> 
> Obviously, and demonstrated by syzbot, this is broken because
> we can have multiple threads deleting entries via hlist_del_rcu().
> 
> To fix this we might be tempted to wrap the hlist operation in a
> bucket lock but that would create a lock inversion problem. In
> summary to follow locking rules the psocks maps list needs the
> sk_callback_lock but we need the bucket lock to do the hlist_del_rcu.
> To resolve the lock inversion problem pop the head of the maps list
> repeatedly and remove the reference until no more are left. If a
> delete happens in parallel from the BPF API that is OK as well because
> it will do a similar action, lookup the lock in the map/hash, delete
> it from the map/hash, and dec the refcnt. We check for this case
> before doing a destroy on the psock to ensure we don't have two
> threads tearing down a psock. The new logic is as follows,
> 
>   bpf_tcp_close()
>   e = psock_map_pop(psock->maps) // done with sk_callback_lock
>   bucket_lock() // lock hash list bucket
>   l = lookup_elem_raw(head, hash, key, key_size);
>   if (l) {
>  //only get here if elmnt was not already removed
>  hlist_del_rcu()
>  ... destroy psock...
>   }
>   bucket_unlock()
> 
> And finally for all the above to work add missing sk_callback_lock
> around smap_list_remove in sock_hash_ctx_update_elem(). Otherwise
> delete and update may corrupt maps list. Then add RCU annotations and
> use rcu_dereference/rcu_assign_pointer to manage values relying on
> RCU so that the object is not free'd from sock_hash_free() while it
> is being referenced in bpf_tcp_close().
> 
> (As an aside the sk_callback_lock serves two purposes. The
>  first, is to update the sock callbacks sk_data_ready, sk_write_space,
>  etc. The second is to protect the psock 'maps' list. The 'maps' list
>  is used to (as shown above) to delete all map/hash references to a
>  sock when the sock is closed)
> 
> Reported-by: syzbot+0ce137753c78f7b6a...@syzkaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend 
> ---
>  kernel/bpf/sockmap.c |  120 
> +++---
>  1 file changed, 84 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 69b26af..333427b 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -72,6 +72,7 @@ struct bpf_htab {
>   u32 n_buckets;
>   u32 elem_size;
>   struct bpf_sock_progs progs;
> + struct rcu_head rcu;
>  };
>  
>  struct htab_elem {
> @@ -89,8 +90,8 @@ enum smap_psock_state {
>  struct smap_psock_map_entry {
>   struct list_head list;
>   struct sock **entry;
> - struct htab_elem *hash_link;
> - struct bpf_htab *htab;
> + struct htab_elem __rcu *hash_link;
> + struct bpf_htab __rcu *htab;
>  };
>  
>  struct smap_psock {
> @@ -258,16 +259,54 @@ static void bpf_tcp_release(struct sock *sk)
>   rcu_read_unlock();
>  }
>  
> +static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> +  u32 hash, void *key, u32 key_size)
> +{
> + struct htab_elem *l;
> +
> + hlist_for_each_entry_rcu(l, head, hash_node) {
> + if (l->hash == hash && !memcmp(&l->key, key, key_size))
> + return l;
> + }
> +
> + return NULL;
> +}
> +
> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> + return &htab->buckets[hash & 

Re: [bpf PATCH v3 2/4] bpf: sockmap, fix smap_list_map_remove when psock is in many maps

2018-06-23 Thread Martin KaFai Lau
On Fri, Jun 22, 2018 at 08:21:39AM -0700, John Fastabend wrote:
> If a hashmap is free'd with open socks it removes the reference to
> the hash entry from the psock. If that is the last reference to the
> psock then it will also be free'd by the reference counting logic.
> However the current logic that removes the hash reference from the
> list of references is broken. In map_list_map_remove() we first check
s/map_list_map_remove/smap_list_remove/

> if the sockmap entry matches and then check if the hashmap entry
> matches. But, the sockmap entry still always matches because it's NULL in
> this case which causes the first entry to be removed from the list.
> If this is always the "right" entry (because the user adds/removes
> entries in order) then everything is OK but otherwise a subsequent
> bpf_tcp_close() may reference a free'd object.
> 
> To fix this create two list handlers one for sockmap and one for
> sockhash.
> 
> Reported-by: syzbot+0ce137753c78f7b6a...@syzkaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend 
One nit.  Other than that,

Acked-by: Martin KaFai Lau 

> ---
>  kernel/bpf/sockmap.c |   33 +
>  1 file changed, 21 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index d7fd17a..69b26af 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -1602,17 +1602,26 @@ static struct bpf_map *sock_map_alloc(union bpf_attr 
> *attr)
>   return ERR_PTR(err);
>  }
>  
> -static void smap_list_remove(struct smap_psock *psock,
> -  struct sock **entry,
> -  struct htab_elem *hash_link)
> +static void smap_list_map_remove(struct smap_psock *psock,
> +  struct sock **entry)
>  {
>   struct smap_psock_map_entry *e, *tmp;
>  
>   list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> - if (e->entry == entry || e->hash_link == hash_link) {
> + if (e->entry == entry)
> + list_del(&e->list);
> + }
> +}
Nit. Add an empty line.

> +static void smap_list_hash_remove(struct smap_psock *psock,
> +   struct htab_elem *hash_link)
> +{
> + struct smap_psock_map_entry *e, *tmp;
> +
> + list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> + struct htab_elem *c = e->hash_link;
> +
> + if (c == hash_link)
>   list_del(&e->list);
> - break;
> - }
>   }
>  }
>  
> @@ -1647,7 +1656,7 @@ static void sock_map_free(struct bpf_map *map)
>* to be null and queued for garbage collection.
>*/
>   if (likely(psock)) {
> - smap_list_remove(psock, &stab->sock_map[i], NULL);
> + smap_list_map_remove(psock, &stab->sock_map[i]);
>   smap_release_sock(psock, sock);
>   }
>   write_unlock_bh(&sock->sk_callback_lock);
> @@ -1706,7 +1715,7 @@ static int sock_map_delete_elem(struct bpf_map *map, 
> void *key)
>  
>   if (psock->bpf_parse)
>   smap_stop_sock(psock, sock);
> - smap_list_remove(psock, &stab->sock_map[k], NULL);
> + smap_list_map_remove(psock, &stab->sock_map[k]);
>   smap_release_sock(psock, sock);
>  out:
>   write_unlock_bh(&sock->sk_callback_lock);
> @@ -1908,7 +1917,7 @@ static int sock_map_ctx_update_elem(struct 
> bpf_sock_ops_kern *skops,
>   struct smap_psock *opsock = smap_psock_sk(osock);
>  
>   write_lock_bh(&osock->sk_callback_lock);
> - smap_list_remove(opsock, &stab->sock_map[i], NULL);
> + smap_list_map_remove(opsock, &stab->sock_map[i]);
>   write_unlock_bh(&osock->sk_callback_lock);
>   write_unlock_bh(>sk_callback_lock);
>   }
> @@ -2124,7 +2133,7 @@ static void sock_hash_free(struct bpf_map *map)
>* (psock) to be null and queued for garbage collection.
>*/
>   if (likely(psock)) {
> - smap_list_remove(psock, NULL, l);
> + smap_list_hash_remove(psock, l);
>   smap_release_sock(psock, sock);
>   }
>   write_unlock_bh(&sock->sk_callback_lock);
> @@ -2304,7 +2313,7 @@ static int sock_hash_ctx_update_elem(struct 
> bpf_sock_ops_kern *skops,
>   psock = smap_psock_sk(l_old->sk);
>  
>   hlist_del_rcu(&l_old->hash_node);
> - smap_list_

Re: [bpf PATCH v3 1/4] bpf: sockmap, fix crash when ipv6 sock is added

2018-06-23 Thread Martin KaFai Lau
On Fri, Jun 22, 2018 at 08:21:34AM -0700, John Fastabend wrote:
> This fixes a crash where we assign tcp_prot to IPv6 sockets instead
> of tcpv6_prot.
> 
> Previously we overwrote the sk->prot field with tcp_prot even in the
> AF_INET6 case. This patch ensures the correct tcp_prot and tcpv6_prot
> are used.
> 
> Tested with 'netserver -6' and 'netperf -H [IPv6]' as well as
> 'netperf -H [IPv4]'. The ESTABLISHED check resolves the previously
> crashing case here.
> 
> Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
> Reported-by: syzbot+5c063698bdbfac19f...@syzkaller.appspotmail.com
> Signed-off-by: John Fastabend 
> Signed-off-by: Wei Wang 
Acked-by: Martin KaFai Lau 

> ---
>  kernel/bpf/sockmap.c |   58 
> +-
>  1 file changed, 48 insertions(+), 10 deletions(-)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 52a91d8..d7fd17a 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -140,6 +140,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr 
> *msg, size_t len,
>  static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
>  static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
>   int offset, size_t size, int flags);
> +static void bpf_tcp_close(struct sock *sk, long timeout);
>  
>  static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
>  {
> @@ -161,7 +162,42 @@ static bool bpf_tcp_stream_read(const struct sock *sk)
>   return !empty;
>  }
>  
> -static struct proto tcp_bpf_proto;
> +enum {
> + SOCKMAP_IPV4,
> + SOCKMAP_IPV6,
> + SOCKMAP_NUM_PROTS,
> +};
> +
> +enum {
> + SOCKMAP_BASE,
> + SOCKMAP_TX,
> + SOCKMAP_NUM_CONFIGS,
> +};
> +
> +static struct proto *saved_tcpv6_prot __read_mostly;
> +static DEFINE_SPINLOCK(tcpv6_prot_lock);
> +static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
> +static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
> +  struct proto *base)
> +{
> + prot[SOCKMAP_BASE]  = *base;
> + prot[SOCKMAP_BASE].close= bpf_tcp_close;
> + prot[SOCKMAP_BASE].recvmsg  = bpf_tcp_recvmsg;
> + prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
> +
> + prot[SOCKMAP_TX]= prot[SOCKMAP_BASE];
> + prot[SOCKMAP_TX].sendmsg= bpf_tcp_sendmsg;
> + prot[SOCKMAP_TX].sendpage   = bpf_tcp_sendpage;
> +}
> +
> +static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
> +{
> + int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
> + int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
> +
> + sk->sk_prot = &bpf_tcp_prots[family][conf];
> +}
> +
>  static int bpf_tcp_init(struct sock *sk)
>  {
>   struct smap_psock *psock;
> @@ -181,14 +217,17 @@ static int bpf_tcp_init(struct sock *sk)
>   psock->save_close = sk->sk_prot->close;
>   psock->sk_proto = sk->sk_prot;
>  
> - if (psock->bpf_tx_msg) {
> - tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
> - tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
> - tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
> - tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
> + /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
> + if (sk->sk_family == AF_INET6 &&
> + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
> + spin_lock_bh(&tcpv6_prot_lock);
> + if (likely(sk->sk_prot != saved_tcpv6_prot)) {
> + build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
> + smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
> + }
> + spin_unlock_bh(&tcpv6_prot_lock);
>   }
> -
> - sk->sk_prot = &tcp_bpf_proto;
> + update_sk_prot(sk, psock);
>   rcu_read_unlock();
>   return 0;
>  }
> @@ -,8 +1150,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock,
>  
>  static int bpf_tcp_ulp_register(void)
>  {
> - tcp_bpf_proto = tcp_prot;
> - tcp_bpf_proto.close = bpf_tcp_close;
> + build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
>   /* Once BPF TX ULP is registered it is never unregistered. It
>* will be in the ULP list for the lifetime of the system. Doing
>* duplicate registers is not a problem.
> 

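The per-family/per-config proto table above uses a publish-once pattern
worth calling out: readers check a pointer with smp_load_acquire() and
only take the spinlock the first time a new tcpv6_prot is seen.  Stripped
to its essentials (names here are illustrative, not from the patch), the
pattern is:

  static struct proto *saved_base __read_mostly;
  static DEFINE_SPINLOCK(rebuild_lock);

  static void maybe_rebuild(struct sock *sk)
  {
          /* fast path: derived protos already built against this base */
          if (likely(sk->sk_prot == smp_load_acquire(&saved_base)))
                  return;

          spin_lock_bh(&rebuild_lock);
          if (sk->sk_prot != saved_base) {
                  /* ... rebuild the derived proto table from sk->sk_prot ... */
                  smp_store_release(&saved_base, sk->sk_prot);
          }
          spin_unlock_bh(&rebuild_lock);
  }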

Re: [PATCH v2 bpf-net] bpf: Change bpf_fib_lookup to return lookup status

2018-06-21 Thread Martin KaFai Lau
On Wed, Jun 20, 2018 at 08:00:11PM -0700, dsah...@kernel.org wrote:
> From: David Ahern 
> 
> For ACLs implemented using either FIB rules or FIB entries, the BPF
> program needs the FIB lookup status to be able to drop the packet.
> Since the bpf_fib_lookup API has not reached a released kernel yet,
> change the return code to contain an encoding of the FIB lookup
> result and return the nexthop device index in the params struct.
> 
> In addition, inform the BPF program of any post FIB lookup reason as
> to why the packet needs to go up the stack.
> 
> The fib result for unicast routes must have an egress device, so remove
> the check that it is non-NULL.
Acked-by: Martin KaFai Lau 

> 
> Signed-off-by: David Ahern 
> ---
> v2
> - drop BPF_FIB_LKUP_RET_NO_NHDEV; check in dev in fib result not needed
> - enhance documentation of BPF_FIB_LKUP_RET_ codes
> 
>  include/uapi/linux/bpf.h   | 28 ++
>  net/core/filter.c  | 72 
> ++
>  samples/bpf/xdp_fwd_kern.c |  8 +++---
>  3 files changed, 74 insertions(+), 34 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 59b19b6a40d7..b7db3261c62d 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1857,7 +1857,8 @@ union bpf_attr {
>   *   is resolved), the nexthop address is returned in ipv4_dst
>   *   or ipv6_dst based on family, smac is set to mac address of
>   *   egress device, dmac is set to nexthop mac address, rt_metric
> - *   is set to metric from route (IPv4/IPv6 only).
> + *   is set to metric from route (IPv4/IPv6 only), and ifindex
> + *   is set to the device index of the nexthop from the FIB lookup.
>   *
>   * *plen* argument is the size of the passed in struct.
>   * *flags* argument can be a combination of one or more of the
> @@ -1873,9 +1874,10 @@ union bpf_attr {
>   * *ctx* is either **struct xdp_md** for XDP programs or
>   * **struct sk_buff** tc cls_act programs.
>   * Return
> - * Egress device index on success, 0 if packet needs to continue
> - * up the stack for further processing or a negative error in 
> case
> - * of failure.
> + *   * < 0 if any input argument is invalid
> + *   *   0 on success (packet is forwarded, nexthop neighbor exists)
> + *   * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
> + *   * packet is not forwarded or needs assist from full stack
>   *
>   * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map 
> *map, void *key, u64 flags)
>   *   Description
> @@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args {
>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
>  
> +enum {
> + BPF_FIB_LKUP_RET_SUCCESS,  /* lookup successful */
> + BPF_FIB_LKUP_RET_BLACKHOLE,/* dest is blackholed; can be dropped */
> + BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable; can be dropped */
> + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */
> + BPF_FIB_LKUP_RET_NOT_FWDED,/* packet is not forwarded */
> + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
> + BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires encapsulation */
> + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */
> + BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* fragmentation required to fwd */
> +};
> +
>  struct bpf_fib_lookup {
>   /* input:  network family for lookup (AF_INET, AF_INET6)
>* output: network family of egress nexthop
> @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
>  
>   /* total length of packet from network header - used for MTU check */
>   __u16   tot_len;
> - __u32   ifindex;  /* L3 device index for lookup */
> +
> + /* input: L3 device index for lookup
> +  * output: device index from FIB lookup
> +  */
> + __u32   ifindex;
>  
>   union {
>   /* inputs to lookup */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e7f12e9f598c..f8dd8aa89de4 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup 
> *params,
>   memcpy(params->smac, dev->dev_addr, ETH_ALEN);
>   params->h_vlan_TCI = 0;
>   params->h_vlan_proto = 0;
> + params->ifindex = dev->ifindex;
>  
> - return dev->ifindex;
> + return 0;
>  }
>  #endif
>  
> @@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net,
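
Pulling the thread together, an xdp_prog written against this revised
contract might handle the return value roughly as follows.  This is a
sketch only: the header parsing is elided, the SEC() macro is the usual
libbpf convention, and the bpf_redirect() flags value is an assumption
for illustration.

  SEC("xdp")
  int xdp_fwd_sketch(struct xdp_md *ctx)
  {
          struct bpf_fib_lookup fib_params = {};
          int rc;

          /* ... fill fib_params (family, addrs, tot_len, ifindex)
           * from the packet headers ...
           */

          rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), 0);
          if (rc < 0)             /* invalid input from this program */
                  return XDP_ABORTED;
          if (rc == 0)            /* route and neighbor resolved */
                  return bpf_redirect(fib_params.ifindex, 0);

          switch (rc) {           /* rc > 0: why the pkt was not forwarded */
          case BPF_FIB_LKUP_RET_BLACKHOLE:
          case BPF_FIB_LKUP_RET_UNREACHABLE:
          case BPF_FIB_LKUP_RET_PROHIBIT:
                  return XDP_DROP;
          default:                /* unknown/new reasons: punt to the stack */
                  return XDP_PASS;
          }
  }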

Re: [PATCH bpf-net] bpf: Change bpf_fib_lookup to return lookup status

2018-06-19 Thread Martin KaFai Lau
On Tue, Jun 19, 2018 at 02:16:53PM -0600, David Ahern wrote:
> On 6/19/18 10:36 AM, Martin KaFai Lau wrote:
> > On Tue, Jun 19, 2018 at 09:34:28AM -0600, David Ahern wrote:
> >> On 6/19/18 9:25 AM, Martin KaFai Lau wrote:
> >>> On Mon, Jun 18, 2018 at 03:35:25PM -0600, David Ahern wrote:
> >>>> On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
> >>>>>>/* rc > 0 case */
> >>>>>>switch(rc) {
> >>>>>>case BPF_FIB_LKUP_RET_BLACKHOLE:
> >>>>>>case BPF_FIB_LKUP_RET_UNREACHABLE:
> >>>>>>case BPF_FIB_LKUP_RET_PROHIBIT:
> >>>>>>return XDP_DROP;
> >>>>>>}
> >>>>>>
> >>>>>> For the others it becomes a question of do we share why the stack needs
> >>>>>> to be involved? Maybe the program wants to collect stats to show 
> >>>>>> traffic
> >>>>>> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
> >>>>>> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
> >>>>>> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
> >>>>> Thanks for the explanation.
> >>>>>
> >>>>> Agree that bpf being able to collect stats will be useful.
> >>>>>
> >>>>> I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
> >>>>> how may the old xdp_prog work/not-work?  As of now, the return value
> >>>>> is straight forward, FWD, PASS (to stack) or DROP (error).
> >>>>> With this change, the xdp_prog needs to match/switch() the
> >>>>> BPF_FIB_LKUP_RET_* to at least PASS and DROP.
> >>>>
> >>>> IMO, programs should only call XDP_DROP for known reasons - like the 3
> >>>> above. Anything else punt to the stack.
> >>>>
> >>>> If a new RET_XYZ comes along:
> >>>> 1. the new XYZ is a new ACL response where the packet is to be dropped.
> >>>> If the program does not understand XYZ and punts to the stack
> >>>> (recommendation), then a second lookup is done during normal packet
> >>>> processing and the stack drops it.
> >>>>
> >>>> 2. the new XYZ is a new path in the kernel that is unsupported with
> >>>> respect to XDP forwarding, nothing new for the program to do.
> >>>>
> >>>> Either way I would expect stats on BPF_FIB_LKUP_RET_* to give a hint to
> >>>> the program writer.
> >>>>
> >>>> Worst case of punting packets to the stack for any rc != 0 means the
> >>>> stack is doing 2 lookups - 1 in XDP based on its lookup parameters and 1
> >>>> in normal stack processing - to handle the packet.
> >>> Instead of having the xdp_prog follow the meaning of each RET_XYZ,
> >>> should the bpf_*_fib_lookup() return value be kept as is such that
> >>> the xdp_prog is clear on what to do?  The reason can be returned in
> >>> the 'struct bpf_fib_lookup'.  The number of reasons can be extended.
> >>> If the xdp_prog does not understand a reason, that still will not
> >>> affect its decision because the return value is clear.
> >>> I think the situation here is similar to a regular syscall, which usually
> >>> uses -1 to clearly state an error and errno to spell out the reason.
> >>>
> >>
> >> I did consider returning the status in struct bpf_fib_lookup. However,
> >> it is 64 bytes and can not be extended without a big performance
> >> penalty, so the only option there is to make an existing entry a union
> >> the most logical of which is the ifindex. It seemed odd to me to have
> >> the result by hidden in the struct as a union on ifindex and returning
> >> the egress index from the function:
> >>
> >> @@ -2625,7 +2636,11 @@ struct bpf_fib_lookup {
> >>
> >> /* total length of packet from network header - used for MTU
> >> check */
> >> __u16   tot_len;
> >> -   __u32   ifindex;  /* L3 device index for lookup */
> >> +
> >> +   union {
> >> +   __u32   ifindex;  /* input: L3 device index for lookup */
> >> +   __u32   result;   /* output: one of BPF_FIB_LKUP_RET_* */
> >> +   };
> >>
> >>
> >> It seemed more natural to have ifindex

Re: [PATCH bpf-net] bpf: Change bpf_fib_lookup to return lookup status

2018-06-19 Thread Martin KaFai Lau
On Tue, Jun 19, 2018 at 09:34:28AM -0600, David Ahern wrote:
> On 6/19/18 9:25 AM, Martin KaFai Lau wrote:
> > On Mon, Jun 18, 2018 at 03:35:25PM -0600, David Ahern wrote:
> >> On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
> >>>>  /* rc > 0 case */
> >>>>  switch(rc) {
> >>>>  case BPF_FIB_LKUP_RET_BLACKHOLE:
> >>>>  case BPF_FIB_LKUP_RET_UNREACHABLE:
> >>>>  case BPF_FIB_LKUP_RET_PROHIBIT:
> >>>>  return XDP_DROP;
> >>>>  }
> >>>>
> >>>> For the others it becomes a question of do we share why the stack needs
> >>>> to be involved? Maybe the program wants to collect stats to show traffic
> >>>> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
> >>>> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
> >>>> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
> >>> Thanks for the explanation.
> >>>
> >>> Agree that bpf being able to collect stats will be useful.
> >>>
> >>> I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
> >>> how may the old xdp_prog work/not-work?  As of now, the return value
> >>> is straight forward, FWD, PASS (to stack) or DROP (error).
> >>> With this change, the xdp_prog needs to match/switch() the
> >>> BPF_FIB_LKUP_RET_* to at least PASS and DROP.
> >>
> >> IMO, programs should only call XDP_DROP for known reasons - like the 3
> >> above. Anything else punt to the stack.
> >>
> >> If a new RET_XYZ comes along:
> >> 1. the new XYZ is a new ACL response where the packet is to be dropped.
> >> If the program does not understand XYZ and punts to the stack
> >> (recommendation), then a second lookup is done during normal packet
> >> processing and the stack drops it.
> >>
> >> 2. the new XYZ is a new path in the kernel that is unsupported with
> >> respect to XDP forwarding, nothing new for the program to do.
> >>
> >> Either way I would expect stats on BPF_FIB_LKUP_RET_* to give a hint to
> >> the program writer.
> >>
> >> Worst case of punting packets to the stack for any rc != 0 means the
> >> stack is doing 2 lookups - 1 in XDP based on its lookup parameters and 1
> >> in normal stack processing - to handle the packet.
> > Instead of having the xdp_prog follow the meaning of each RET_XYZ,
> > should the bpf_*_fib_lookup() return value be kept as is such that
> > the xdp_prog is clear on what to do?  The reason can be returned in
> > the 'struct bpf_fib_lookup'.  The number of reasons can be extended.
> > If the xdp_prog does not understand a reason, that still will not
> > affect its decision because the return value is clear.
> > I think the situation here is similar to a regular syscall, which usually
> > uses -1 to clearly state an error and errno to spell out the reason.
> > 
> 
> I did consider returning the status in struct bpf_fib_lookup. However,
> it is 64 bytes and can not be extended without a big performance
> penalty, so the only option there is to make an existing entry a union
> the most logical of which is the ifindex. It seemed odd to me to have
> the result by hidden in the struct as a union on ifindex and returning
> the egress index from the function:
> 
> @@ -2625,7 +2636,11 @@ struct bpf_fib_lookup {
> 
> /* total length of packet from network header - used for MTU
> check */
> __u16   tot_len;
> -   __u32   ifindex;  /* L3 device index for lookup */
> +
> +   union {
> +   __u32   ifindex;  /* input: L3 device index for lookup */
> +   __u32   result;   /* output: one of BPF_FIB_LKUP_RET_* */
> +   };
> 
> 
> It seemed more natural to have ifindex stay ifindex and only change
> value on return:
> 
> @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
> 
>   /* total length of packet from network header - used for MTU check */
>   __u16   tot_len;
> - __u32   ifindex;  /* L3 device index for lookup */
> +
> + /* input: L3 device index for lookup
> +  * output: nexthop device index from FIB lookup
> +  */
> + __u32   ifindex;
> 
>   union {
>   /* inputs to lookup */
> 
> 
> From a program's perspective:
> 
> rc < 0  -- program is passing incorrect data
> rc == 0 -- packet can be forwarded
> rc > 0  -- packet can not be forwarded.
> 
> BPF programs are not required to track the LKUP_RET values any more than
> a function retur

Re: [PATCH bpf-net] bpf: Change bpf_fib_lookup to return lookup status

2018-06-19 Thread Martin KaFai Lau
On Mon, Jun 18, 2018 at 03:35:25PM -0600, David Ahern wrote:
> On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
> >>/* rc > 0 case */
> >>switch(rc) {
> >>case BPF_FIB_LKUP_RET_BLACKHOLE:
> >>case BPF_FIB_LKUP_RET_UNREACHABLE:
> >>case BPF_FIB_LKUP_RET_PROHIBIT:
> >>return XDP_DROP;
> >>}
> >>
> >> For the others it becomes a question of do we share why the stack needs
> >> to be involved? Maybe the program wants to collect stats to show traffic
> >> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
> >> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
> >> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
> > Thanks for the explanation.
> > 
> > Agree that bpf being able to collect stats will be useful.
> > 
> > I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
> > how may the old xdp_prog work/not-work?  As of now, the return value
> > is straight forward, FWD, PASS (to stack) or DROP (error).
> > With this change, the xdp_prog needs to match/switch() the
> > BPF_FIB_LKUP_RET_* to at least PASS and DROP.
> 
> IMO, programs should only call XDP_DROP for known reasons - like the 3
> above. Anything else punt to the stack.
> 
> If a new RET_XYZ comes along:
> 1. the new XYZ is a new ACL response where the packet is to be dropped.
> If the program does not understand XYZ and punts to the stack
> (recommendation), then a second lookup is done during normal packet
> processing and the stack drops it.
> 
> 2. the new XYZ is a new path in the kernel that is unsupported with
> respect to XDP forwarding, nothing new for the program to do.
> 
> Either way I would expect stats on BPF_FIB_LKUP_RET_* to give a hint to
> the program writer.
> 
> Worst case of punting packets to the stack for any rc != 0 means the
> stack is doing 2 lookups - 1 in XDP based on its lookup parameters and 1
> in normal stack processing - to handle the packet.
Instead of having the xdp_prog follow the meaning of each RET_XYZ,
should the bpf_*_fib_lookup() return value be kept as is such that
the xdp_prog is clear on what to do?  The reason can be returned in
the 'struct bpf_fib_lookup'.  The number of reasons can be extended.
If the xdp_prog does not understand a reason, that still will not
affect its decision because the return value is clear.
I think the situation here is similar to a regular syscall, which usually
uses -1 to clearly state an error and errno to spell out the reason.

> 
> > 
> >>
> >> Arguably BPF_FIB_LKUP_RET_NO_NHDEV is not needed. See below.
> >>
> >>>> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
> >>>>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
> >>>>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
> >>>>  
> >>>> +enum {
> >>>> +BPF_FIB_LKUP_RET_SUCCESS,  /* lookup successful */
> >>>> +BPF_FIB_LKUP_RET_BLACKHOLE,/* dest is blackholed */
> >>>> +BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
> >>>> +BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed */
> >>>> +BPF_FIB_LKUP_RET_NOT_FWDED,/* pkt is not forwardded */
> >>> BPF_FIB_LKUP_RET_NOT_FWDED is a catch all?
> >>>
> >>
> >> Destination is local. More precisely, the FIB lookup is not unicast so
> >> not forwarded. It could be RTN_LOCAL, RTN_BROADCAST, RTN_ANYCAST, or
> >> RTN_MULTICAST. The next ones -- blackhole, reachable, prohibit -- are
> >> called out.
> > I think it also includes the tbid not found case.
> 
> Another one of those "should never happen scenarios". The user does not
> specify the table; it is retrieved based on device association. Table
> defaults to the main table - which always exists - and any VRF
> enslavement of a device happens after the VRF device creates the table.
> 
> > 
> >>
> >>>> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, 
> >>>> struct bpf_fib_lookup *params,
> >>>>  if (check_mtu) {
> >>>>  mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
> >>>>  if (params->tot_len > mtu)
> >>>> -return 0;
> >>>> +return BPF_FIB_LKUP_RET_FRAG_NEEDED;
> >>>>  }
> >>>>  
> >>>>  if (f6i->fib6_nh.nh_lwtstate)
> >>>> -return 0;

Re: [bpf PATCH v2 2/6] bpf: sockmap only allow ESTABLISHED sock state

2018-06-18 Thread Martin KaFai Lau
On Mon, Jun 18, 2018 at 07:50:19AM -0700, John Fastabend wrote:
> On 06/14/2018 05:18 PM, Martin KaFai Lau wrote:
> > On Thu, Jun 14, 2018 at 09:44:52AM -0700, John Fastabend wrote:
> >> Per the note in the TLS ULP (which is actually a generic statement
> >> regarding ULPs)
> >>
> >>  /* The TLS ulp is currently supported only for TCP sockets
> >>   * in ESTABLISHED state.
> >>   * Supporting sockets in LISTEN state will require us
> >>   * to modify the accept implementation to clone rather then
> >>   * share the ulp context.
> >>   */
> > Can you explain how that apply to bpf_tcp ulp?
> > 
> > My understanding is the "ulp context" referred in TLS ulp is
> > the tls_context stored in icsk_ulp_data but I don't see bpf_tcp's
> > ulp is using icsk_ulp_data.
> > 
> > Others LGTM.
> > 
> 
> So I think you are right we could probably allow it
> here but I am thinking I'll leave the check for now
> anyways for a couple reasons. First, we will shortly
> add support to allow ULP types to coexist. At the moment
> the two ULP types can not coexist. When this happens it
> looks like we will need to restrict to only ESTABLISHED
> types or somehow make all ULPs work in all states.
> 
> Second, I don't have any use cases (nor can I think of
> any) for the sock{map|hash} ULP to be running on a non
> ESTABLISHED socket. Its not clear to me that having the
> sendmsg/sendpage hooks for a LISTEN socket makes sense.
> I would rather restrict it now and if we add something
> later where it makes sense to run on non-ESTABLISHED
> socks we can remove the check.
Makes sense if there is no use case.  It will be helpful if the commit log
is updated accordingly.  Thanks!

Acked-by: Martin KaFai Lau 
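
For context, the ESTABLISHED-only restriction discussed here reduces to a
state check at update time; a minimal sketch (assumed shape and error
code, not the literal patch) would be:

  /* refuse to add a socket to a sock{map|hash} unless it is
   * a fully established TCP socket
   */
  if (skops->sk->sk_state != TCP_ESTABLISHED)
          return -EOPNOTSUPP;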


Re: [PATCH bpf-net] bpf: Change bpf_fib_lookup to return lookup status

2018-06-18 Thread Martin KaFai Lau
On Mon, Jun 18, 2018 at 12:27:07PM -0600, David Ahern wrote:
> On 6/18/18 12:11 PM, Martin KaFai Lau wrote:
> > On Sun, Jun 17, 2018 at 08:18:19AM -0700, dsah...@kernel.org wrote:
> >> From: David Ahern 
> >>
> >> For ACLs implemented using either FIB rules or FIB entries, the BPF
> >> program needs the FIB lookup status to be able to drop the packet.
> > Except BPF_FIB_LKUP_RET_SUCCESS and BPF_FIB_LKUP_RET_NO_NEIGH,  can you
> > give an example on how the xdp_prog may decide XDP_PASS vs XDP_DROP based
> > on other BPF_FIB_LKUP_RET_*?
> > 
> 
>   rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
>   if (rc == 0)
>   packet is forwarded, do the redirect
> 
>   /* the program is misconfigured -- wrong parameters in struct or flags 
> */
>   if (rc < 0)
>   
> 
>   /* rc > 0 case */
>   switch(rc) {
>   case BPF_FIB_LKUP_RET_BLACKHOLE:
>   case BPF_FIB_LKUP_RET_UNREACHABLE:
>   case BPF_FIB_LKUP_RET_PROHIBIT:
>   return XDP_DROP;
>   }
> 
> For the others it becomes a question of do we share why the stack needs
> to be involved? Maybe the program wants to collect stats to show traffic
> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
Thanks for the explanation.

Agree that bpf being able to collect stats will be useful.

I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
how may the old xdp_prog work/not-work?  As of now, the return value
is straight forward, FWD, PASS (to stack) or DROP (error).
With this change, the xdp_prog needs to match/switch() the
BPF_FIB_LKUP_RET_* to at least PASS and DROP.

> 
> Arguably BPF_FIB_LKUP_RET_NO_NHDEV is not needed. See below.
> 
> >> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
> >>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
> >>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
> >>  
> >> +enum {
> >> +  BPF_FIB_LKUP_RET_SUCCESS,  /* lookup successful */
> >> +  BPF_FIB_LKUP_RET_BLACKHOLE,/* dest is blackholed */
> >> +  BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
> >> +  BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed */
> >> +  BPF_FIB_LKUP_RET_NOT_FWDED,/* pkt is not forwardded */
> > BPF_FIB_LKUP_RET_NOT_FWDED is a catch all?
> > 
> 
> Destination is local. More precisely, the FIB lookup is not unicast so
> not forwarded. It could be RTN_LOCAL, RTN_BROADCAST, RTN_ANYCAST, or
> RTN_MULTICAST. The next ones -- blackhole, reachable, prohibit -- are
> called out.
I think it also includes the tbid not found case.

> 
> >> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, 
> >> struct bpf_fib_lookup *params,
> >>if (check_mtu) {
> >>mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
> >>if (params->tot_len > mtu)
> >> -  return 0;
> >> +  return BPF_FIB_LKUP_RET_FRAG_NEEDED;
> >>}
> >>  
> >>if (f6i->fib6_nh.nh_lwtstate)
> >> -  return 0;
> >> +  return BPF_FIB_LKUP_RET_UNSUPP_LWT;
> >>  
> >>if (f6i->fib6_flags & RTF_GATEWAY)
> >>*dst = f6i->fib6_nh.nh_gw;
> >>  
> >>dev = f6i->fib6_nh.nh_dev;
> >> +  if (unlikely(!dev))
> >> +  return BPF_FIB_LKUP_RET_NO_NHDEV;
> > Is this a bug fix?
> > 
> 
> Difference between IPv4 and IPv6. Making them consistent.
> 
> It is a major BUG in the kernel to reach this point in either protocol
> to have a unicast route not tied to a device. IPv4 has checks; v6 does
> not. I figured this being new code, why not make bpf_ipv{4,6}_fib_lookup
> as close to the same as possible.
Makes sense.  A comment in the commit log will be useful if there is a
re-spin.


Re: [PATCH bpf-net] bpf: Change bpf_fib_lookup to return lookup status

2018-06-18 Thread Martin KaFai Lau
On Sun, Jun 17, 2018 at 08:18:19AM -0700, dsah...@kernel.org wrote:
> From: David Ahern 
> 
> For ACLs implemented using either FIB rules or FIB entries, the BPF
> program needs the FIB lookup status to be able to drop the packet.
Except BPF_FIB_LKUP_RET_SUCCESS and BPF_FIB_LKUP_RET_NO_NEIGH,  can you
give an example on how the xdp_prog may decide XDP_PASS vs XDP_DROP based
on other BPF_FIB_LKUP_RET_*?

> Since the bpf_fib_lookup API has not reached a released kernel yet,
> change the return code to contain an encoding of the FIB lookup
> result and return the nexthop device index in the params struct.
> 
> In addition, inform the BPF program of any post FIB lookup reason as
> to why the packet needs to go up the stack.
> 
> Update the sample program per the change in API.
> 
> Signed-off-by: David Ahern 
> ---
>  include/uapi/linux/bpf.h   | 28 ++
>  net/core/filter.c  | 74 
> --
>  samples/bpf/xdp_fwd_kern.c |  8 ++---
>  3 files changed, 78 insertions(+), 32 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 59b19b6a40d7..ceb80071c341 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1857,7 +1857,8 @@ union bpf_attr {
>   *   is resolved), the nexthop address is returned in ipv4_dst
>   *   or ipv6_dst based on family, smac is set to mac address of
>   *   egress device, dmac is set to nexthop mac address, rt_metric
> - *   is set to metric from route (IPv4/IPv6 only).
> + *   is set to metric from route (IPv4/IPv6 only), and ifindex
> + *   is set to the device index of the nexthop from the FIB lookup.
>   *
>   * *plen* argument is the size of the passed in struct.
>   * *flags* argument can be a combination of one or more of the
> @@ -1873,9 +1874,9 @@ union bpf_attr {
>   * *ctx* is either **struct xdp_md** for XDP programs or
>   * **struct sk_buff** tc cls_act programs.
>   * Return
> - * Egress device index on success, 0 if packet needs to continue
> - * up the stack for further processing or a negative error in 
> case
> - * of failure.
> + *   < 0 if any input argument is invalid
> + * 0 on success (packet is forwarded and nexthop neighbor exists)
> + *   > 0 one of BPF_FIB_LKUP_RET_ codes on FIB lookup response
>   *
>   * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map 
> *map, void *key, u64 flags)
>   *   Description
> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
>  
> +enum {
> + BPF_FIB_LKUP_RET_SUCCESS,  /* lookup successful */
> + BPF_FIB_LKUP_RET_BLACKHOLE,/* dest is blackholed */
> + BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
> + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed */
> + BPF_FIB_LKUP_RET_NOT_FWDED,/* pkt is not forwardded */
BPF_FIB_LKUP_RET_NOT_FWDED is a catch all?

> + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
> + BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires unsupported encap */
> + BPF_FIB_LKUP_RET_NO_NHDEV, /* nh device does not exist */
> + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neigh entry for nh */
> + BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* pkt too big to fwd */
> +};
> +
>  struct bpf_fib_lookup {
>   /* input:  network family for lookup (AF_INET, AF_INET6)
>* output: network family of egress nexthop
> @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
>  
>   /* total length of packet from network header - used for MTU check */
>   __u16   tot_len;
> - __u32   ifindex;  /* L3 device index for lookup */
> +
> + /* input: L3 device index for lookup
> +  * output: nexthop device index from FIB lookup
> +  */
> + __u32   ifindex;
>  
>   union {
>   /* inputs to lookup */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e7f12e9f598c..e758ca487878 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup 
> *params,
>   memcpy(params->smac, dev->dev_addr, ETH_ALEN);
>   params->h_vlan_TCI = 0;
>   params->h_vlan_proto = 0;
> + params->ifindex = dev->ifindex;
>  
> - return dev->ifindex;
> + return 0;
>  }
>  #endif
>  
> @@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct 
> bpf_fib_lookup *params,
>   /* verify forwarding is enabled on this interface */
>   in_dev = __in_dev_get_rcu(dev);
>   if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> - return 0;
> + return BPF_FIB_LKUP_RET_FWD_DISABLED;
>  
>   if (flags & BPF_FIB_LOOKUP_OUTPUT) {
>   fl4.flowi4_iif = 1;
> @@ -4123,7 +4124,7 @@ static int 

Re: [PATCH bpf 2/2] bpf: reject any prog that failed read-only lock

2018-06-15 Thread Martin KaFai Lau
On Fri, Jun 15, 2018 at 02:30:48AM +0200, Daniel Borkmann wrote:
> We currently lock any JITed image as read-only via bpf_jit_binary_lock_ro()
> as well as the BPF image as read-only through bpf_prog_lock_ro(). In
> the case any of these would fail we throw a WARN_ON_ONCE() in order to
> yell loudly to the log. Perhaps, to some extend, this may be comparable
> to an allocation where __GFP_NOWARN is explicitly not set.
> 
> Added via 65869a47f348 ("bpf: improve read-only handling"), this behavior
> is slightly different compared to any of the other in-kernel set_memory_ro()
> users who do not check the return code of set_memory_ro() and friends /at
> all/ (e.g. in the case of module_enable_ro() / module_disable_ro()). Given
> in BPF this is mandatory hardening step, we want to know whether there
> are any issues that would leave both BPF data writable. So it happens
> that syzkaller enabled fault injection and it triggered memory allocation
> failure deep inside x86's change_page_attr_set_clr() which was triggered
> from set_memory_ro().
> 
> Now, there are two options: i) leaving everything as is, and ii) reworking
> the image locking code in order to have a final checkpoint out of the
> central bpf_prog_select_runtime() which probes whether any of the calls
> during prog setup weren't successful, and then bailing out with an error.
> Option ii) is a better approach since this additional paranoia avoids
> altogether leaving any potential W+X pages from BPF side in the system.
> Therefore, lets be strict about it, and reject programs in such unlikely
> occasion. While testing I noticed also that one bpf_prog_lock_ro()
> call was missing on the outer dummy prog in case of calls, e.g. in the
> destructor we call bpf_prog_free_deferred() on the main prog where we
> try to bpf_prog_unlock_free() the program, and since we go via
> bpf_prog_select_runtime() do that as well.
> 
> Reported-by: syzbot+3b889862e65a98317...@syzkaller.appspotmail.com
> Reported-by: syzbot+9e762b52dd17e616a...@syzkaller.appspotmail.com
> Signed-off-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf 1/2] bpf: fix panic in prog load calls cleanup

2018-06-15 Thread Martin KaFai Lau
On Fri, Jun 15, 2018 at 02:30:47AM +0200, Daniel Borkmann wrote:
> While testing I found that when hitting error path in bpf_prog_load()
> where we jump to free_used_maps and prog contained BPF to BPF calls
> that were JITed earlier, then we never clean up the bpf_prog_kallsyms_add()
> done under jit_subprogs(). Add proper API to make BPF kallsyms deletion
> more clear and fix that.
> 
> Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs")
> Signed-off-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 

> ---
>  include/linux/filter.h |  3 +++
>  kernel/bpf/core.c  | 14 ++
>  kernel/bpf/syscall.c   |  8 ++--
>  3 files changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 45fc0f5..297c56f 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -961,6 +961,9 @@ static inline void bpf_prog_kallsyms_del(struct bpf_prog 
> *fp)
>  }
>  #endif /* CONFIG_BPF_JIT */
>  
> +void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp);
> +void bpf_prog_kallsyms_del_all(struct bpf_prog *fp);
> +
>  #define BPF_ANC  BIT(15)
>  
>  static inline bool bpf_needs_clear_a(const struct sock_filter *first)
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 9f14937..1061968 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -350,6 +350,20 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog 
> *prog, u32 off,
>   return prog_adj;
>  }
>  
> +void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
> +{
> + int i;
> +
> + for (i = 0; i < fp->aux->func_cnt; i++)
> + bpf_prog_kallsyms_del(fp->aux->func[i]);
> +}
> +
> +void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
> +{
> + bpf_prog_kallsyms_del_subprogs(fp);
> + bpf_prog_kallsyms_del(fp);
> +}
> +
>  #ifdef CONFIG_BPF_JIT
>  /* All BPF JIT sysctl knobs here. */
>  int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0fa2062..0f62692 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1034,14 +1034,9 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
>  static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
>  {
> - if (atomic_dec_and_test(&prog->aux->refcnt)) {
> - int i;
> -
>   /* bpf_prog_free_id() must be called first */
>   bpf_prog_free_id(prog, do_idr_lock);
> -
> - for (i = 0; i < prog->aux->func_cnt; i++)
> - bpf_prog_kallsyms_del(prog->aux->func[i]);
> - bpf_prog_kallsyms_del(prog);
> + bpf_prog_kallsyms_del_all(prog);
>  
>   call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>   }
> @@ -1384,6 +1379,7 @@ static int bpf_prog_load(union bpf_attr *attr)
>   return err;
>  
>  free_used_maps:
> + bpf_prog_kallsyms_del_subprogs(prog);
>   free_used_maps(prog->aux);
>  free_prog:
>   bpf_prog_uncharge_memlock(prog);
> -- 
> 2.9.5
> 


Re: [bpf PATCH v2 3/6] bpf: sockhash fix omitted bucket lock in sock_close

2018-06-15 Thread Martin KaFai Lau
On Fri, Jun 15, 2018 at 08:23:14AM -0700, John Fastabend wrote:
> On 06/14/2018 10:41 PM, Martin KaFai Lau wrote:
> > On Thu, Jun 14, 2018 at 09:44:57AM -0700, John Fastabend wrote:
> >> First in tcp_close, reduce scope of sk_callback_lock() the lock is
> >> only needed for protecting smap_release_sock() the ingress and cork
> >> lists are protected by sock lock. Having the lock in wider scope is
> >> harmless but may confuse the reader who may infer it is in fact
> >> needed.
> >>
> >> Next, in sock_hash_delete_elem() the pattern is as follows,
> >>
> >>   sock_hash_delete_elem()
> >>  [...]
> >>  spin_lock(bucket_lock)
> >>  l = lookup_elem_raw()
> >>  if (l)
> >> hlist_del_rcu()
> >> write_lock(sk_callback_lock)
> >>   destroy psock ...
> >> write_unlock(sk_callback_lock)
> >>  spin_unlock(bucket_lock)
> >>
> >> The ordering is necessary because we only know the {p}sock after
> >> dereferencing the hash table which we can't do unless we have the
> >> bucket lock held. Once we have the bucket lock and the psock element
> >> it is deleted from the hashmap to ensure any other path doing a lookup
> >> will fail. Finally, the refcnt is decremented and if zero the psock
> >> is destroyed.
> >>
> >> In parallel with the above (or free'ing the map) a tcp close event
> >> may trigger tcp_close(). Which at the moment omits the bucket lock
> >> altogether (oops!) where the flow looks like this,
> >>
> >>   bpf_tcp_close()
> >>  [...]
> >>  write_lock(sk_callback_lock)
> >>  for each psock->maps // list of maps this sock is part of
> >>  hlist_del_rcu(ref_hash_node);
> >>   destroy psock ...
> >>  write_unlock(sk_callback_lock)
> >>
> >> Obviously, and demonstrated by syzbot, this is broken because
> >> we can have multiple threads deleting entries via hlist_del_rcu().
> >>
> >> To fix this we might be tempted to wrap the hlist operation in a
> >> bucket lock but that would create a lock inversion problem. In
> >> summary to follow locking rules maps needs the sk_callback_lock but we
> >> need the bucket lock to do the hlist_del_rcu. To resolve the lock
> >> inversion problem note that when bpf_tcp_close is called no updates
> >> can happen in parallel, due to ESTABLISH state check in update logic,
> >> so pop the head of the list repeatedly and remove the reference until
> >> no more are left. If a delete happens in parallel from the BPF API
> >> that is OK as well because it will do a similar action, lookup the
> >> sock in the map/hash, delete it from the map/hash, and dec the refcnt.
> >> We check for this case before doing a destroy on the psock to ensure
> >> we don't have two threads tearing down a psock. The new logic is
> >> as follows,
> >>
> >>   bpf_tcp_close()
> >>   e = psock_map_pop(psock->maps) // done with sk_callback_lock
> >>   bucket_lock() // lock hash list bucket
> >>   l = lookup_elem_raw(head, hash, key, key_size);
> >>   if (l) {
> >>  //only get here if elmnt was not already removed
> >>  hlist_del_rcu()
> >>  ... destroy psock...
> >>   }
> >>   bucket_unlock()
> >>
> >> And finally, for all the above to work, add the missing sk_callback_lock
> >> around smap_list_remove() in sock_hash_ctx_update_elem(). Otherwise
> >> delete and update may corrupt the maps list.
> >>
> >> (As an aside, the sk_callback_lock serves two purposes. The
> >>  first is to update the sock callbacks sk_data_ready, sk_write_space,
> >>  etc. The second is to protect the psock 'maps' list. The 'maps' list
> >>  is used (as shown above) to delete all map/hash references to a
> >>  sock when the sock is closed.)
> >>
> >> (If we did not have the ESTABLISHED state guarantee from tcp_close,
> >>  then we could not ensure completion, because updates could happen
> >>  forever and pin the thread in the delete loop.)
> >>
> >> Reported-by: syzbot+0ce137753c78f7b6a...@syzkaller.appspotmail.com
> >> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> >> Signed-off-by: John Fastabend 
> >> ---
> >>  0 files changed
> >>
> 
>  Will fix this "0 files changed" as well.
> 
> >>struct bpf_htab *htab = container_of(map, struct bp

Re: [bpf PATCH v2 6/6] bpf: selftest remove attempts to add LISTEN sockets to sockmap

2018-06-15 Thread Martin KaFai Lau
On Thu, Jun 14, 2018 at 09:45:12AM -0700, John Fastabend wrote:
> In selftest test_maps, the sockmap test case attempts to add a socket
> in the listening state to the sockmap. This is no longer a valid operation,
> so it fails as expected. However, the test now wrongly reports this
> failure as an error. Fix the test to avoid adding sockets in the
> listening state.
> 
> Signed-off-by: John Fastabend 
Acked-by: Martin KaFai Lau 

> ---
>  0 files changed
> 
> diff --git a/tools/testing/selftests/bpf/test_maps.c 
> b/tools/testing/selftests/bpf/test_maps.c
> index 6c25334..9fed5f0 100644
> --- a/tools/testing/selftests/bpf/test_maps.c
> +++ b/tools/testing/selftests/bpf/test_maps.c
> @@ -564,7 +564,7 @@ static void test_sockmap(int tasks, void *data)
>   }
>  
>   /* Test update without programs */
> - for (i = 0; i < 6; i++) {
> + for (i = 2; i < 6; i++) {
>   err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
>   if (err) {
>   printf("Failed noprog update sockmap '%i:%i'\n",
> @@ -727,7 +727,7 @@ static void test_sockmap(int tasks, void *data)
>   }
>  
>   /* Test map update elem afterwards fd lives in fd and map_fd */
> - for (i = 0; i < 6; i++) {
> + for (i = 2; i < 6; i++) {
>   err = bpf_map_update_elem(map_fd_rx, &i, &sfd[i], BPF_ANY);
>   if (err) {
>   printf("Failed map_fd_rx update sockmap %i '%i:%i'\n",
> 


Re: [bpf PATCH v2 5/6] bpf: sockhash, add release routine

2018-06-15 Thread Martin KaFai Lau
On Thu, Jun 14, 2018 at 09:45:07AM -0700, John Fastabend wrote:
> Add the map_release_uref pointer to the hashmap ops. This was dropped
> when the original sockhash code was ported into bpf-next before the
> initial commit.
> 
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend 
Acked-by: Martin KaFai Lau 

> ---
>  0 files changed
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index ffc5152..77fe204 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -2518,6 +2518,7 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map 
> *map, void *key)
>   .map_get_next_key = sock_hash_get_next_key,
>   .map_update_elem = sock_hash_update_elem,
>   .map_delete_elem = sock_hash_delete_elem,
> + .map_release_uref = sock_map_release,
>  };
>  
>  static bool bpf_is_valid_sock(struct bpf_sock_ops_kern *ops)
> 


Re: [bpf PATCH v2 4/6] bpf: sockmap, tcp_disconnect to listen transition

2018-06-15 Thread Martin KaFai Lau
On Thu, Jun 14, 2018 at 09:45:02AM -0700, John Fastabend wrote:
> After adding checks to ensure TCP is in the ESTABLISHED state when a
> sock is added, we also need to ensure that the user does not transition
> through tcp_disconnect() and back into the ESTABLISHED state without
> sockmap removing the sock.
> 
> To do this, add an unhash hook and remove the sock from the map there.
> 
> Reported-by: Eric Dumazet 
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend 
LGTM. One nit.

Acked-by: Martin KaFai Lau 

> ---
>  0 files changed
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 04764f5..ffc5152 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -130,6 +130,7 @@ struct smap_psock {
>  
>   struct proto *sk_proto;
>   void (*save_close)(struct sock *sk, long timeout);
> + void (*save_unhash)(struct sock *sk);
>   void (*save_data_ready)(struct sock *sk);
>   void (*save_write_space)(struct sock *sk);
>  };
> @@ -141,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr 
> *msg, size_t len,
>  static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
>   int offset, size_t size, int flags);
>  static void bpf_tcp_close(struct sock *sk, long timeout);
> +static void bpf_tcp_unhash(struct sock *sk);
>  
>  static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
>  {
> @@ -182,6 +184,7 @@ static void build_protos(struct proto 
> prot[SOCKMAP_NUM_CONFIGS],
>  {
>   prot[SOCKMAP_BASE]  = *base;
>   prot[SOCKMAP_BASE].close= bpf_tcp_close;
> + prot[SOCKMAP_BASE].unhash   = bpf_tcp_unhash;
>   prot[SOCKMAP_BASE].recvmsg  = bpf_tcp_recvmsg;
>   prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
>  
> @@ -215,6 +218,7 @@ static int bpf_tcp_init(struct sock *sk)
>   }
>  
>   psock->save_close = sk->sk_prot->close;
> + psock->save_unhash = sk->sk_prot->unhash;
>   psock->sk_proto = sk->sk_prot;
>  
>   /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
> @@ -302,28 +306,12 @@ struct smap_psock_map_entry *psock_map_pop(struct sock 
> *sk,
>   return e;
>  }
>  
> -static void bpf_tcp_close(struct sock *sk, long timeout)
> +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
>  {
> - void (*close_fun)(struct sock *sk, long timeout);
>   struct smap_psock_map_entry *e;
>   struct sk_msg_buff *md, *mtmp;
> - struct smap_psock *psock;
>   struct sock *osk;
>  
> - rcu_read_lock();
> - psock = smap_psock_sk(sk);
> - if (unlikely(!psock)) {
> - rcu_read_unlock();
> - return sk->sk_prot->close(sk, timeout);
> - }
> -
> - /* The psock may be destroyed anytime after exiting the RCU critial
> -  * section so by the time we use close_fun the psock may no longer
> -  * be valid. However, bpf_tcp_close is called with the sock lock
> -  * held so the close hook and sk are still valid.
> -  */
> - close_fun = psock->save_close;
> -
>   if (psock->cork) {
>   free_start_sg(psock->sock, psock->cork);
>   kfree(psock->cork);
> @@ -378,6 +366,51 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>   }
>   e = psock_map_pop(sk, psock);
>   }
> +}
> +
> +static void bpf_tcp_unhash(struct sock *sk)
> +{
> + void (*unhash_fun)(struct sock *sk);
> + struct smap_psock *psock;
> +
> + rcu_read_lock();
> + psock = smap_psock_sk(sk);
> + if (unlikely(!psock)) {
> + rcu_read_unlock();
> + return sk->sk_prot->unhash(sk);
> + }
> +
> + /* The psock may be destroyed anytime after exiting the RCU critial
> +  * section so by the time we use close_fun the psock may no longer
> +  * be valid. However, bpf_tcp_close is called with the sock lock
> +  * held so the close hook and sk are still valid.
> +  */
Nit. s/close/unhash/

> + unhash_fun = psock->save_unhash;
> + bpf_tcp_remove(sk, psock);
> + rcu_read_unlock();
> + unhash_fun(sk);
> +
> +}
> +
> +static void bpf_tcp_close(struct sock *sk, long timeout)
> +{
> + void (*close_fun)(struct sock *sk, long timeout);
> + struct smap_psock *psock;
> +
> + rcu_read_lock();
> + psock = smap_psock_sk(sk);
> + if (unlikely(!psock)) {
> + rcu_read_unlock();
> + return sk->sk_prot->close(sk, timeout);
> + }
> +
> + /* The psock may be destroyed anytime after exiting the RCU critial
> +  * section so by the time we use close_fun the psock may no longer
> +  * be valid. However, bpf_tcp_close is called with the sock lock
> +  * held so the close hook and sk are still valid.
> +  */
> + close_fun = psock->save_close;
> + bpf_tcp_remove(sk, psock);
>   rcu_read_unlock();
>   close_fun(sk, timeout);
>  }
> 


Re: [bpf PATCH v2 3/6] bpf: sockhash fix omitted bucket lock in sock_close

2018-06-14 Thread Martin KaFai Lau
On Thu, Jun 14, 2018 at 09:44:57AM -0700, John Fastabend wrote:
> First, in tcp_close, reduce the scope of sk_callback_lock(): the lock is
> only needed to protect smap_release_sock(); the ingress and cork
> lists are protected by the sock lock. Having the lock in a wider scope is
> harmless but may confuse a reader into inferring it is in fact
> needed.
> 
> Next, in sock_hash_delete_elem() the pattern is as follows,
> 
>   sock_hash_delete_elem()
>  [...]
>  spin_lock(bucket_lock)
>  l = lookup_elem_raw()
>  if (l)
> hlist_del_rcu()
> write_lock(sk_callback_lock)
>   destroy psock ...
> write_unlock(sk_callback_lock)
>  spin_unlock(bucket_lock)
> 
> The ordering is necessary because we only know the {p}sock after
> dereferencing the hash table, which we can't do unless we hold the
> bucket lock. Once we have the bucket lock and the psock element,
> it is deleted from the hashmap to ensure any other path doing a lookup
> will fail. Finally, the refcnt is decremented and, if zero, the psock
> is destroyed.
> 
> In parallel with the above (or with freeing the map), a tcp close event
> may trigger tcp_close(), which at the moment omits the bucket lock
> altogether (oops!). The flow looks like this,
> 
>   bpf_tcp_close()
>  [...]
>  write_lock(sk_callback_lock)
>  for each psock->maps // list of maps this sock is part of
>  hlist_del_rcu(ref_hash_node);
>   destroy psock ...
>  write_unlock(sk_callback_lock)
> 
> Obviously, and as demonstrated by syzbot, this is broken because
> we can have multiple threads deleting entries via hlist_del_rcu().
> 
> To fix this we might be tempted to wrap the hlist operation in a
> bucket lock, but that would create a lock inversion problem. In
> summary, to follow the locking rules, the maps list needs the
> sk_callback_lock, but we need the bucket lock to do the hlist_del_rcu().
> To resolve the lock inversion problem, note that when bpf_tcp_close
> is called no updates can happen in parallel, due to the ESTABLISHED
> state check in the update logic, so pop the head of the list repeatedly
> and remove the reference until no more are left. If a delete happens in
> parallel from the BPF API that is OK as well, because it will do a
> similar action: look up the sock in the map/hash, delete it from the
> map/hash, and dec the refcnt. We check for this case before doing a
> destroy on the psock to ensure we don't have two threads tearing
> down a psock. The new logic is as follows,
> 
>   bpf_tcp_close()
>   e = psock_map_pop(psock->maps) // done with sk_callback_lock
>   bucket_lock() // lock hash list bucket
>   l = lookup_elem_raw(head, hash, key, key_size);
>   if (l) {
>  //only get here if the element was not already removed
>  hlist_del_rcu()
>  ... destroy psock...
>   }
>   bucket_unlock()
> 
> And finally, for all the above to work, add the missing sk_callback_lock
> around smap_list_remove() in sock_hash_ctx_update_elem(). Otherwise
> delete and update may corrupt the maps list.
> 
> (As an aside, the sk_callback_lock serves two purposes. The
>  first is to update the sock callbacks sk_data_ready, sk_write_space,
>  etc. The second is to protect the psock 'maps' list. The 'maps' list
>  is used (as shown above) to delete all map/hash references to a
>  sock when the sock is closed.)
> 
> (If we did not have the ESTABLISHED state guarantee from tcp_close,
>  then we could not ensure completion, because updates could happen
>  forever and pin the thread in the delete loop.)
> 
> Reported-by: syzbot+0ce137753c78f7b6a...@syzkaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend 
> ---
>  0 files changed
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index f1ab52d..04764f5 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -258,16 +258,54 @@ static void bpf_tcp_release(struct sock *sk)
>   rcu_read_unlock();
>  }
>  
> +static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> +  u32 hash, void *key, u32 key_size)
> +{
> + struct htab_elem *l;
> +
> + hlist_for_each_entry_rcu(l, head, hash_node) {
> + if (l->hash == hash && !memcmp(&l->key, key, key_size))
> + return l;
> + }
> +
> + return NULL;
> +}
> +
> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> + return &htab->buckets[hash & (htab->n_buckets - 1)];
> +}
> +
> +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 
> hash)
> +{
> + return &__select_bucket(htab, hash)->head;
> +}
> +
>  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  {
>   atomic_dec(&htab->count);
>   kfree_rcu(l, rcu);
>  }
>  
> +struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
> +struct smap_psock *psock)
> +{
> + struct smap_psock_map_entry *e;
> +
> + 
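(The archived message is truncated here. Going by the commit message
above -- pop the head of psock->maps with the sk_callback_lock held --
the body of psock_map_pop() is presumably along these lines; a sketch,
not the verbatim patch:

	write_lock_bh(&sk->sk_callback_lock);
	e = list_first_entry_or_null(&psock->maps,
				     struct smap_psock_map_entry, list);
	if (e)
		list_del(&e->list);
	write_unlock_bh(&sk->sk_callback_lock);
	return e;

i.e. each caller gets exactly one map entry at a time and can then take
the per-bucket lock without holding sk_callback_lock.)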

Re: [bpf PATCH v2 2/6] bpf: sockmap only allow ESTABLISHED sock state

2018-06-14 Thread Martin KaFai Lau
On Thu, Jun 14, 2018 at 09:44:52AM -0700, John Fastabend wrote:
> Per the note in the TLS ULP (which is actually a generic statement
> regarding ULPs)
> 
>  /* The TLS ulp is currently supported only for TCP sockets
>   * in ESTABLISHED state.
>   * Supporting sockets in LISTEN state will require us
>   * to modify the accept implementation to clone rather then
>   * share the ulp context.
>   */
Can you explain how that applies to the bpf_tcp ulp?

My understanding is that the "ulp context" referred to in the TLS ulp is
the tls_context stored in icsk_ulp_data, but I don't see bpf_tcp's
ulp using icsk_ulp_data.

Others LGTM.
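
(For reference, a rough sketch of what the TLS ulp does at init time --
simplified, not the verbatim net/tls code -- showing where that context
lives:

	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tls_context *ctx;

	/* The per-sock ULP state hangs off icsk_ulp_data; accept()
	 * would share this between listener and child socks, hence
	 * the LISTEN restriction in the quoted comment.
	 */
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	icsk->icsk_ulp_data = ctx;

bpf_tcp's ulp, by contrast, keeps its state in the psock.)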

> 
> After this patch we only allow socks that are in the ESTABLISHED state,
> or that are being added via a sock_ops event that is transitioning into
> the ESTABLISHED state. By allowing sock_ops events we allow users to
> manage sockmaps directly from sock_ops programs. The two supported
> sock_ops ops are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and
> BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB.
> 
> From the userspace BPF update API, the sock lock is also taken now
> to ensure we don't race with state changes after the ESTABLISHED
> check. The BPF program sock_ops hook already has the sock lock
> taken.
> 
> Also tested with 'netserver -6' and 'netperf -H [IPv6]' as well as
> 'netperf -H [IPv4]'.
> 
> Reported-by: Eric Dumazet 
> Signed-off-by: John Fastabend 
> ---
>  0 files changed
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index f6dd4cd..f1ab52d 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -1976,13 +1976,20 @@ static int sock_map_update_elem(struct bpf_map *map,
>   return -EINVAL;
>   }
>  
> + lock_sock(skops.sk);
> + /* ULPs are currently supported only for TCP sockets in ESTABLISHED
> +  * state.
> +  */
>   if (skops.sk->sk_type != SOCK_STREAM ||
> - skops.sk->sk_protocol != IPPROTO_TCP) {
> - fput(socket->file);
> - return -EOPNOTSUPP;
> + skops.sk->sk_protocol != IPPROTO_TCP ||
> + skops.sk->sk_state != TCP_ESTABLISHED) {
> + err = -EOPNOTSUPP;
> + goto out;
>   }
>  
>   err = sock_map_ctx_update_elem(&skops, map, key, flags);
> +out:
> + release_sock(skops.sk);
>   fput(socket->file);
>   return err;
>  }
> @@ -2247,10 +2254,6 @@ static int sock_hash_ctx_update_elem(struct 
> bpf_sock_ops_kern *skops,
>  
>   sock = skops->sk;
>  
> - if (sock->sk_type != SOCK_STREAM ||
> - sock->sk_protocol != IPPROTO_TCP)
> - return -EOPNOTSUPP;
> -
>   if (unlikely(map_flags > BPF_EXIST))
>   return -EINVAL;
>  
> @@ -2338,7 +2341,20 @@ static int sock_hash_update_elem(struct bpf_map *map,
>   return -EINVAL;
>   }
>  
> + lock_sock(skops.sk);
> + /* ULPs are currently supported only for TCP sockets in ESTABLISHED
> +  * state.
> +  */
> + if (skops.sk->sk_type != SOCK_STREAM ||
> + skops.sk->sk_protocol != IPPROTO_TCP ||
> + skops.sk->sk_state != TCP_ESTABLISHED) {
> + err = -EOPNOTSUPP;
> + goto out;
> + }
> +
>   err = sock_hash_ctx_update_elem(&skops, map, key, flags);
> +out:
> + release_sock(skops.sk);
>   fput(socket->file);
>   return err;
>  }
> @@ -2423,10 +2439,19 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map 
> *map, void *key)
>   .map_delete_elem = sock_hash_delete_elem,
>  };
>  
> +static bool bpf_is_valid_sock(struct bpf_sock_ops_kern *ops)
> +{
> + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
> +ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
> +}
> +
>  BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
>  struct bpf_map *, map, void *, key, u64, flags)
>  {
>   WARN_ON_ONCE(!rcu_read_lock_held());
> +
> + if (!bpf_is_valid_sock(bpf_sock))
> + return -EOPNOTSUPP;
>   return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
>  }
>  
> @@ -2445,6 +2470,9 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map 
> *map, void *key)
>  struct bpf_map *, map, void *, key, u64, flags)
>  {
>   WARN_ON_ONCE(!rcu_read_lock_held());
> +
> + if (!bpf_is_valid_sock(bpf_sock))
> + return -EOPNOTSUPP;
>   return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
>  }
>  
> 


Re: [bpf PATCH v2 1/6] bpf: sockmap, fix crash when ipv6 sock is added

2018-06-14 Thread Martin KaFai Lau
On Thu, Jun 14, 2018 at 09:44:46AM -0700, John Fastabend wrote:
> This fixes a crash where we assign tcp_prot to IPv6 sockets instead
> of tcpv6_prot.
> 
> Previously we overwrote the sk->sk_prot field with tcp_prot even in the
> AF_INET6 case. This patch ensures the correct tcp_prot and tcpv6_prot
> are used.

> Further, only allow ESTABLISHED connections to join the
> map per note in TLS ULP,
> 
>/* The TLS ulp is currently supported only for TCP sockets
> * in ESTABLISHED state.
> * Supporting sockets in LISTEN state will require us
> * to modify the accept implementation to clone rather then
> * share the ulp context.
> */
This bit has been moved to patch 2.

> 
> Also tested with 'netserver -6' and 'netperf -H [IPv6]' as well as
> 'netperf -H [IPv4]'. The ESTABLISHED check resolves the previously
> crashing case here.
> 
> Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
> Reported-by: syzbot+5c063698bdbfac19f...@syzkaller.appspotmail.com
> Signed-off-by: John Fastabend 
> Signed-off-by: Wei Wang 
> ---
>  0 files changed
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 52a91d8..f6dd4cd 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -140,6 +140,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr 
> *msg, size_t len,
>  static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
>  static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
>   int offset, size_t size, int flags);
> +static void bpf_tcp_close(struct sock *sk, long timeout);
>  
>  static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
>  {
> @@ -161,7 +162,42 @@ static bool bpf_tcp_stream_read(const struct sock *sk)
>   return !empty;
>  }
>  
> -static struct proto tcp_bpf_proto;
> +enum {
> + SOCKMAP_IPV4,
> + SOCKMAP_IPV6,
> + SOCKMAP_NUM_PROTS,
> +};
> +
> +enum {
> + SOCKMAP_BASE,
> + SOCKMAP_TX,
> + SOCKMAP_NUM_CONFIGS,
> +};
> +
> +static struct proto *saved_tcpv6_prot;
__read_mostly
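
i.e., presumably something like (a sketch of the suggested annotation,
not the committed code):

	static struct proto *saved_tcpv6_prot __read_mostly;

since saved_tcpv6_prot is written at most once per tcpv6_prot address
change but is read on every IPv6 sock init.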

> +static DEFINE_MUTEX(tcpv6_prot_mutex);
> +static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
> +static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
> +  struct proto *base)
> +{
> + prot[SOCKMAP_BASE]  = *base;
> + prot[SOCKMAP_BASE].close= bpf_tcp_close;
> + prot[SOCKMAP_BASE].recvmsg  = bpf_tcp_recvmsg;
> + prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
> +
> + prot[SOCKMAP_TX]= prot[SOCKMAP_BASE];
> + prot[SOCKMAP_TX].sendmsg= bpf_tcp_sendmsg;
> + prot[SOCKMAP_TX].sendpage   = bpf_tcp_sendpage;
> +}
> +
> +static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
> +{
> + int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
> + int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
> +
> + sk->sk_prot = &bpf_tcp_prots[family][conf];
> +}
> +
>  static int bpf_tcp_init(struct sock *sk)
>  {
>   struct smap_psock *psock;
> @@ -181,14 +217,17 @@ static int bpf_tcp_init(struct sock *sk)
>   psock->save_close = sk->sk_prot->close;
>   psock->sk_proto = sk->sk_prot;
>  
> - if (psock->bpf_tx_msg) {
> - tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
> - tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
> - tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
> - tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
> + /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
> + if (sk->sk_family == AF_INET6 &&
> + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
> + mutex_lock(&tcpv6_prot_mutex);
bpf_tcp_init() can be called by skops?
Can mutex_lock() be used here?

> + if (likely(sk->sk_prot != saved_tcpv6_prot)) {
> + build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
> + smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
> + }
> + mutex_unlock(&tcpv6_prot_mutex);
>   }
> -
> - sk->sk_prot = &tcp_bpf_proto;
> + update_sk_prot(sk, psock);
>   rcu_read_unlock();
>   return 0;
>  }
> @@ -,8 +1150,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock,
>  
>  static int bpf_tcp_ulp_register(void)
>  {
> - tcp_bpf_proto = tcp_prot;
> - tcp_bpf_proto.close = bpf_tcp_close;
> + build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
>   /* Once BPF TX ULP is registered it is never unregistered. It
>* will be in the ULP list for the lifetime of the system. Doing
>* duplicate registers is not a problem.
> 


Re: [PATCH bpf-net] selftests/bpf: delete xfrm tunnel when test exits.

2018-06-14 Thread Martin KaFai Lau
On Thu, Jun 14, 2018 at 05:01:06AM -0700, William Tu wrote:
> Make the printing of the bpf xfrm tunnel test better and
> clean up the xfrm state and policy when the xfrm test finishes.
LGTM.  The subject tag actually meant s/bpf-net/bpf-next/?

It makes sense to be in bpf-next but I think bpf-next is still closed.
Please repost later.

> 
> Signed-off-by: William Tu 
> ---
>  tools/testing/selftests/bpf/test_tunnel.sh | 24 +---
>  1 file changed, 13 insertions(+), 11 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/test_tunnel.sh 
> b/tools/testing/selftests/bpf/test_tunnel.sh
> index aeb2901f21f4..7b1946b340be 100755
> --- a/tools/testing/selftests/bpf/test_tunnel.sh
> +++ b/tools/testing/selftests/bpf/test_tunnel.sh
> @@ -608,28 +608,26 @@ setup_xfrm_tunnel()
>  test_xfrm_tunnel()
>  {
>   config_device
> -#tcpdump -nei veth1 ip &
> - output=$(mktemp)
> - cat /sys/kernel/debug/tracing/trace_pipe | tee $output &
> -setup_xfrm_tunnel
> + > /sys/kernel/debug/tracing/trace
> + setup_xfrm_tunnel
>   tc qdisc add dev veth1 clsact
>   tc filter add dev veth1 proto ip ingress bpf da obj test_tunnel_kern.o \
>   sec xfrm_get_state
>   ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
>   sleep 1
> - grep "reqid 1" $output
> + grep "reqid 1" /sys/kernel/debug/tracing/trace
>   check_err $?
> - grep "spi 0x1" $output
> + grep "spi 0x1" /sys/kernel/debug/tracing/trace
>   check_err $?
> - grep "remote ip 0xac100164" $output
> + grep "remote ip 0xac100164" /sys/kernel/debug/tracing/trace
>   check_err $?
>   cleanup
>  
>   if [ $ret -ne 0 ]; then
> -echo -e ${RED}"FAIL: xfrm tunnel"${NC}
> -return 1
> -fi
> -echo -e ${GREEN}"PASS: xfrm tunnel"${NC}
> + echo -e ${RED}"FAIL: xfrm tunnel"${NC}
> + return 1
> + fi
> + echo -e ${GREEN}"PASS: xfrm tunnel"${NC}
>  }
>  
>  attach_bpf()
> @@ -657,6 +655,10 @@ cleanup()
>   ip link del ip6geneve11 2> /dev/null
>   ip link del erspan11 2> /dev/null
>   ip link del ip6erspan11 2> /dev/null
> + ip xfrm policy delete dir out src 10.1.1.200/32 dst 10.1.1.100/32 2> 
> /dev/null
> + ip xfrm policy delete dir in src 10.1.1.100/32 dst 10.1.1.200/32 2> 
> /dev/null
> + ip xfrm state delete src 172.16.1.100 dst 172.16.1.200 proto esp spi 
> 0x1 2> /dev/null
> + ip xfrm state delete src 172.16.1.200 dst 172.16.1.100 proto esp spi 
> 0x2 2> /dev/null
>  }
>  
>  cleanup_exit()
> -- 
> 2.7.4
> 


Re: [PATCH bpf-next v5 00/10] BTF: BPF Type Format

2018-06-14 Thread Martin KaFai Lau
On Thu, Jun 14, 2018 at 12:03:34PM -0300, Arnaldo Carvalho de Melo wrote:

> > > > > 1. The tools/testing/selftests/bpf/Makefile has the CLANG_FLAGS and
> > > > >LLC_FLAGS needed to compile the bpf prog.  It requires a new
> > > > >"-mattr=dwarf" llc option which was added to the future
> > > > >llvm 7.0.

[ ... ]

> I tried it, but it didn't work, see:
> 
> [root@jouet bpf]# cat hello.c 
> #include "stdio.h"
> 
> int syscall_enter(openat)(void *ctx)
> {
>   puts("Hello, world\n");
>   return 0;
> }
> [root@jouet bpf]# trace -e openat,hello.c touch /tmp/kafai
> clang-6.0: error: unknown argument: '-mattr=dwarf'
"-mattr=dwarf" is currently a llc only option.

tools/testing/selftests/bpf/Makefile has example on how to pipe clang to llc.

e.g.:
clang -g -O2 -target bpf -emit-llvm -c hello.c -o - | llc -march=bpf 
-mcpu=generic -mattr=dwarfris -filetype=obj -o hello.o

> ERROR:unable to compile hello.c
> Hint: Check error message shown above.
> Hint: You can also pre-compile it into .o using:
>   clang -target bpf -O2 -c hello.c
>   with proper -I and -D options.
> event syntax error: 'hello.c'
>  \___ Failed to load hello.c from source: Error when 
> compiling BPF scriptlet
> 
> (add -v to see detail)
> Run 'perf list' for a list of valid events
> 
>  Usage: perf trace [] []
> or: perf trace [] --  []
> or: perf trace record [] []
> or: perf trace record [] --  []
> 
> -e, --eventevent/syscall selector. use 'perf list' to list 
> available events
> [root@jouet bpf]#
> 
> The full command line with that is:
> 
> [root@jouet bpf]# trace -v -e openat,hello.c touch /tmp/kafai |& grep mattr
> set env: CLANG_OPTIONS=-g -mattr=dwarf
> llvm compiling command : /usr/local/bin/clang -D__KERNEL__ -D__NR_CPUS__=4 
> -DLINUX_VERSION_CODE=0x41100 -g -mattr=dwarf  -nostdinc -isystem 
> /usr/lib/gcc/x86_64-redhat-linux/7/include 
> -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated  
> -I/home/acme/git/linux/include -I./include 
> -I/home/acme/git/linux/arch/x86/include/uapi 
> -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi 
> -I./include/generated/uapi -include 
> /home/acme/git/linux/include/linux/kconfig.h  
> -I/home/acme/lib/include/perf/bpf -Wno-unused-value -Wno-pointer-sign 
> -working-directory /lib/modules/4.17.0-rc5/build -c /home/acme/bpf/hello.c 
> -target bpf -O2 -o -
> clang-6.0: error: unknown argument: '-mattr=dwarf'
> [root@jouet bpf]#
> 
> This is with these llvm and clang trees:
> 
> [root@jouet llvm]# git log --oneline -5
> 98c78e82f54 (HEAD -> master, origin/master, origin/HEAD) [asan] Instrument 
> comdat globals on COFF targets
> 6ad988b5998 [DAGCombiner] clean up comments; NFC
> a735ba5b795 [X86][SSE] Support v8i16/v16i16 rotations
> 1503b9f6fe8 [x86] add tests for node-level FMF; NFC
> 4a49826736f [x86] regenerate test checks; NFC
> [root@jouet llvm]#
> 
> [root@jouet llvm]# cd tools/clang/
> [root@jouet clang]# git log --oneline -5
> 8c873daccc (HEAD -> master, origin/master, origin/HEAD) [X86] Add builtins 
> for vpermq/vpermpd instructions to enable target feature checking.
> a344be6ba4 [X86] Change immediate type for some builtins from char to int.
> dcdd53793e [CUDA] Fix emission of constant strings in sections
> a90c85acaf [X86] Add builtins for shufps and shufpd to enable target feature 
> and immediate range checking.
> ff71c0eccc [X86] Add builtins for pshufd, pshuflw, and pshufhw to enable 
> target feature and immediate range checking.
> [root@jouet clang]#
> 
> [root@jouet clang]# git log | grep mattr=dwarf
> [root@jouet clang]# cd -
> /home/acme/git.tmp/git/llvm
> [root@jouet llvm]# git log | grep mattr=dwarf
> bpf: introduce -mattr=dwarfris to disable 
> DwarfUsesRelocationsAcrossSections
> This patch introduces a new flag -mattr=dwarfris
> [root@jouet llvm]#
> 
> Hmm, so it's -mattr=dwarfris and not -mattr=dwarf?
> 
> Didn't help :-\
> 
> commit 0e0047f8c9ada2f0fe0c5f01579a80e2455b8df5
> Author: Yonghong Song 
> Date:   Thu Mar 1 23:04:59 2018 +
> 
> bpf: introduce -mattr=dwarfris to disable 
> DwarfUsesRelocationsAcrossSections
> 
> Commit e4507fb8c94b ("bpf: disable DwarfUsesRelocationsAcrossSections")
> disables MCAsmInfo DwarfUsesRelocationsAcrossSections unconditionally
> so that dwarf will not use cross section (between dwarf and symbol table)
> relocations. This new debug format enables pahole to dump structures
> correctly as libdwarves.so does not have BPF backend support yet.
> 
> This new debug format, however, breaks bcc 
> (https://github.com/iovisor/bcc)
> source debug output as llvm in-memory Dwarf support has some issues to
> handle it. More specifically, with DwarfUsesRelocationsAcrossSections
> disabled, JIT compiler does not generate .debug_abbrev and Dwarf
> DIE (debug info entry) processing is not happy about this.
> 
> This patch introduces a new flag 

Re: [bpf PATCH 4/6] bpf: sockmap, tcp_disconnect to listen transition

2018-06-13 Thread Martin KaFai Lau
On Wed, Jun 13, 2018 at 10:50:14AM -0700, John Fastabend wrote:
> After adding checks to ensure TCP is in the ESTABLISHED state when a
> sock is added, we also need to ensure that the user does not transition
> through tcp_disconnect() and back into the ESTABLISHED state without
> sockmap removing the sock.
> 
> To do this, add an unhash hook and remove the sock from the map there.
In bpf_tcp_init():
sk->sk_prot = &tcp_bpf_proto;

I may have missed a lock while reading sockmap.c.
Is it possible that tcp_disconnect() is being called while
the above assignment is also being done (e.g. through BPF_MAP_UPDATE_ELEM)?
The same goes for the ESTABLISHED check.

> 
> Reported-by: Eric Dumazet 
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend 
> ---
>  kernel/bpf/sockmap.c |   67 
> +-
>  1 file changed, 50 insertions(+), 17 deletions(-)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 2e848cd..91c7b47 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -130,6 +130,7 @@ struct smap_psock {
>  
>   struct proto *sk_proto;
>   void (*save_close)(struct sock *sk, long timeout);
> + void (*save_unhash)(struct sock *sk);
>   void (*save_data_ready)(struct sock *sk);
>   void (*save_write_space)(struct sock *sk);
>  };
> @@ -141,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr 
> *msg, size_t len,
>  static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
>   int offset, size_t size, int flags);
>  static void bpf_tcp_close(struct sock *sk, long timeout);
> +static void bpf_tcp_unhash(struct sock *sk);
>  
>  static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
>  {
> @@ -182,6 +184,7 @@ static void build_protos(struct proto 
> prot[SOCKMAP_NUM_CONFIGS],
>  {
>   prot[SOCKMAP_BASE]  = *base;
>   prot[SOCKMAP_BASE].close= bpf_tcp_close;
> + prot[SOCKMAP_BASE].unhash   = bpf_tcp_unhash;
>   prot[SOCKMAP_BASE].recvmsg  = bpf_tcp_recvmsg;
>   prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
>  
> @@ -215,6 +218,7 @@ static int bpf_tcp_init(struct sock *sk)
>   }
>  
>   psock->save_close = sk->sk_prot->close;
> + psock->save_unhash = sk->sk_prot->unhash;
>   psock->sk_proto = sk->sk_prot;
>  
>   /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
> @@ -302,28 +306,12 @@ struct smap_psock_map_entry *psock_map_pop(struct sock 
> *sk,
>   return e;
>  }
>  
> -static void bpf_tcp_close(struct sock *sk, long timeout)
> +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
>  {
> - void (*close_fun)(struct sock *sk, long timeout);
>   struct smap_psock_map_entry *e;
>   struct sk_msg_buff *md, *mtmp;
> - struct smap_psock *psock;
>   struct sock *osk;
>  
> - rcu_read_lock();
> - psock = smap_psock_sk(sk);
> - if (unlikely(!psock)) {
> - rcu_read_unlock();
> - return sk->sk_prot->close(sk, timeout);
> - }
> -
> - /* The psock may be destroyed anytime after exiting the RCU critial
> -  * section so by the time we use close_fun the psock may no longer
> -  * be valid. However, bpf_tcp_close is called with the sock lock
> -  * held so the close hook and sk are still valid.
> -  */
> - close_fun = psock->save_close;
> -
>   if (psock->cork) {
>   free_start_sg(psock->sock, psock->cork);
>   kfree(psock->cork);
> @@ -378,6 +366,51 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>   }
>   e = psock_map_pop(sk, psock);
>   }
> +}
> +
> +static void bpf_tcp_unhash(struct sock *sk)
> +{
> + void (*unhash_fun)(struct sock *sk);
> + struct smap_psock *psock;
> +
> + rcu_read_lock();
> + psock = smap_psock_sk(sk);
> + if (unlikely(!psock)) {
> + rcu_read_unlock();
> + return sk->sk_prot->unhash(sk);
> + }
> +
> + /* The psock may be destroyed anytime after exiting the RCU critial
> +  * section so by the time we use close_fun the psock may no longer
> +  * be valid. However, bpf_tcp_close is called with the sock lock
> +  * held so the close hook and sk are still valid.
> +  */
> + unhash_fun = psock->save_unhash;
> + bpf_tcp_remove(sk, psock);
> + rcu_read_unlock();
> + unhash_fun(sk);
> +
> +}
> +
> +static void bpf_tcp_close(struct sock *sk, long timeout)
> +{
> + void (*close_fun)(struct sock *sk, long timeout);
> + struct smap_psock *psock;
> +
> + rcu_read_lock();
> + psock = smap_psock_sk(sk);
> + if (unlikely(!psock)) {
> + rcu_read_unlock();
> + return sk->sk_prot->close(sk, timeout);
> + }
> +
> + /* The psock may be destroyed anytime after exiting the RCU critial
> +  * section so by the 

Re: [PATCH bpf-next v5 00/10] BTF: BPF Type Format

2018-06-13 Thread Martin KaFai Lau
On Tue, Jun 12, 2018 at 05:41:26PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Tue, Jun 12, 2018 at 05:31:24PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Thu, Jun 07, 2018 at 01:07:01PM -0700, Martin KaFai Lau escreveu:
> > > On Thu, Jun 07, 2018 at 04:30:29PM -0300, Arnaldo Carvalho de Melo wrote:
> > > > So this must be available in a newer llvm version? Which one?
> > 
> > > I should have put in the details in my last email or
> > > in the commit message, my bad.
> >  
> > > 1. The tools/testing/selftests/bpf/Makefile has the CLANG_FLAGS and
> > >LLC_FLAGS needed to compile the bpf prog.  It requires a new
> > >"-mattr=dwarf" llc option which was added to the future
> > >llvm 7.0.
> 
> > [root@jouet bpf]# pahole hello.o
> > struct clang version 5.0.1 (tags/RELEASE_501/final) {
> > clang version 5.0.1 (tags/RELEASE_501/final) clang version 5.0.1 
> > (tags/RELEASE_501/final); /* 0 4 */
> > clang version 5.0.1 (tags/RELEASE_501/final) clang version 5.0.1 
> > (tags/RELEASE_501/final); /* 4 4 */
> > clang version 5.0.1 (tags/RELEASE_501/final) clang version 5.0.1 
> > (tags/RELEASE_501/final); /* 8 4 */
> > clang version 5.0.1 (tags/RELEASE_501/final) clang version 5.0.1 
> > (tags/RELEASE_501/final); /*12 4 */
> > 
> > /* size: 16, cachelines: 1, members: 4 */
> > /* last cacheline: 16 bytes */
> > };
> > [root@jouet bpf]# 
> > 
> > Ok, I guess I saw this case in the llvm/clang git logs, so this one was
> > generated with the older clang, will regenerate and add that "-mattr=dwarf"
> > part.
> 
> [root@jouet bpf]# pahole hello.o
> struct clang version 7.0.0 (http://llvm.org/git/clang.git 8c873daccce7ee5339b9fd82c81fe02b73543b65) (http://llvm.org/git/llvm.git 98c78e82f54be8fb0bb5f02e3ca674fbde10ef34) {
> 	clang version 7.0.0 (http://llvm.org/git/clang.git 8c873daccce7ee5339b9fd82c81fe02b73543b65) (http://llvm.org/git/llvm.git 98c78 clang version 7.0.0 (http://llvm.org/git/clang.git 8c873daccce7ee5339b9fd82c81fe02b73543b65) (http://llvm.org/git/llvm.git 98c78e82f54be8fb0bb5f02e3ca674fbde10ef34); /* 0 4 */
> 	clang version 7.0.0 (http://llvm.org/git/clang.git 8c873daccce7ee5339b9fd82c81fe02b73543b65) (http://llvm.org/git/llvm.git 98c78 clang version 7.0.0 (http://llvm.org/git/clang.git 8c873daccce7ee5339b9fd82c81fe02b73543b65) (http://llvm.org/git/llvm.git 98c78e82f54be8fb0bb5f02e3ca674fbde10ef34); /* 4 4 */
> 	clang version 7.0.0 (http://l

Re: [PATCH net] ipv6: allow PMTU exceptions to local routes

2018-06-11 Thread Martin KaFai Lau
On Mon, Jun 11, 2018 at 02:02:54AM +0300, Julian Anastasov wrote:
> IPVS setups with local client and remote tunnel server need
> to create exception for the local virtual IP. What we do is to
> change PMTU from 64KB (on "lo") to 1460 in the common case.
> 
> Suggested-by: Martin KaFai Lau 
> Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering 
> pmtu exception")
> Fixes: 7343ff31ebf0 ("ipv6: Don't create clones of host routes.")
> Signed-off-by: Julian Anastasov 
Acked-by: Martin KaFai Lau 

