Re: [PATCH bpf-next v4 0/3] add batched ops for percpu array

2021-04-16 Thread Martin KaFai Lau
On Thu, Apr 15, 2021 at 02:46:16PM -0300, Pedro Tammela wrote:
> This patchset introduces batched operations for the per-cpu variant of
> the array map.
> 
> It also removes the percpu macros from 'bpf_util.h'. This change was
> suggested by Andrii in an earlier iteration of this patchset.
> 
> The tests were updated to reflect all the new changes.
Acked-by: Martin KaFai Lau 


Re: [PATCH] tools/testing: Remove unused variable

2021-04-14 Thread Martin KaFai Lau
On Wed, Apr 14, 2021 at 10:16:39PM +0800, zuoqil...@163.com wrote:
> From: zuoqilin 
> 
> Remove unused variable "ret2".
Please tag the targeting branch in the future as described in
Documentation/bpf/bpf_devel_QA.rst.

This one belongs to bpf-next.

Acked-by: Martin KaFai Lau 


Re: [PATCH] selftests/bpf: Fix the ASSERT_ERR_PTR macro

2021-04-14 Thread Martin KaFai Lau
On Wed, Apr 14, 2021 at 05:56:32PM +0200, Florent Revest wrote:
> It is just missing a ';'. This macro is not used by any test yet.
> 
> Signed-off-by: Florent Revest 
Fixes: 22ba36351631 ("selftests/bpf: Move and extend ASSERT_xxx() testing macros")

Since it has not been used, it could be bpf-next.  Please also tag
it in the future.

Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v3 3/3] bpf: selftests: update array map tests for per-cpu batched ops

2021-04-12 Thread Martin KaFai Lau
On Mon, Apr 12, 2021 at 04:40:01PM -0300, Pedro Tammela wrote:
> Follows the same logic as the hashtable tests.
> 
> Signed-off-by: Pedro Tammela 
> ---
>  .../bpf/map_tests/array_map_batch_ops.c   | 110 +-
>  1 file changed, 80 insertions(+), 30 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c 
> b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
> index e42ea1195d18..707d17414dee 100644
> --- a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
> +++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
> @@ -10,32 +10,59 @@
>  #include 
>  
>  static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
> -  int *values)
> +  __s64 *values, bool is_pcpu)
>  {
> - int i, err;
> + int nr_cpus = libbpf_num_possible_cpus();
Instead of getting it multiple times, how about moving it out to
a static global and initializing it in test_array_map_batch_ops()?
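e.g. an untested sketch (it reuses the CHECK() macro already in this file
and assumes both the pcpu and non-pcpu variants are driven from the test
entry point):

    /* file scope, initialized once by the test entry point */
    static int nr_cpus;

    void test_array_map_batch_ops(void)
    {
            nr_cpus = libbpf_num_possible_cpus();
            CHECK(nr_cpus < 0, "nr_cpus checking",
                  "error: get possible cpus failed");

            __test_map_lookup_and_update_batch(false);
            __test_map_lookup_and_update_batch(true);
    }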


> + int i, j, err;
> + int offset = 0;
>   DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
>   .elem_flags = 0,
>   .flags = 0,
>   );
>  
> + CHECK(nr_cpus < 0, "nr_cpus checking",
> +   "error: get possible cpus failed");
> +
>   for (i = 0; i < max_entries; i++) {
>   keys[i] = i;
> - values[i] = i + 1;
> + if (is_pcpu)
> + for (j = 0; j < nr_cpus; j++)
> + (values + offset)[j] = i + 1 + j;
> + else
> + values[i] = i + 1;
> + offset += nr_cpus;
This "offset" update here is confusing to read because it is only
used in the is_pcpu case but it always gets updated regardless.
How about only defines and uses offset in the "if (is_pcpu)" case and
rename it to "cpu_offset": cpu_offset = i * nr_cpus.

The same goes for other occasions.
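i.e. something like this untested sketch, keeping the offset local to the
per-cpu branch:

    for (i = 0; i < max_entries; i++) {
            keys[i] = i;
            if (is_pcpu) {
                    int cpu_offset = i * nr_cpus;

                    for (j = 0; j < nr_cpus; j++)
                            values[cpu_offset + j] = i + 1 + j;
            } else {
                    values[i] = i + 1;
            }
    }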

>   }
>  
>  	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts);
>   CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
>  }
>  
> -static void map_batch_verify(int *visited, __u32 max_entries,
> -  int *keys, int *values)
> +static void map_batch_verify(int *visited, __u32 max_entries, int *keys,
> +  __s64 *values, bool is_pcpu)
>  {
> - int i;
> + int nr_cpus = libbpf_num_possible_cpus();
> + int i, j;
> + int offset = 0;
> +
> + CHECK(nr_cpus < 0, "nr_cpus checking",
> +   "error: get possible cpus failed");
>  
>   memset(visited, 0, max_entries * sizeof(*visited));
>   for (i = 0; i < max_entries; i++) {
> - CHECK(keys[i] + 1 != values[i], "key/value checking",
> -   "error: i %d key %d value %d\n", i, keys[i], values[i]);
> + if (is_pcpu) {
> + for (j = 0; j < nr_cpus; j++) {
> + __s64 value = (values + offset)[j];
> + CHECK(keys[i] + j + 1 != value,
> +   "key/value checking",
> +   "error: i %d j %d key %d value %d\n", i,
> +   j, keys[i], value);
> + }
> + } else {
> + CHECK(keys[i] + 1 != values[i], "key/value checking",
> +   "error: i %d key %d value %d\n", i, keys[i],
> +   values[i]);
> + }
> + offset += nr_cpus;
>   visited[i] = 1;
>   }
>   for (i = 0; i < max_entries; i++) {
> @@ -44,45 +71,52 @@ static void map_batch_verify(int *visited, __u32 
> max_entries,
>   }
>  }
>  
> -void test_array_map_batch_ops(void)
> +void __test_map_lookup_and_update_batch(bool is_pcpu)
static

>  {
> + int nr_cpus = libbpf_num_possible_cpus();
>   struct bpf_create_map_attr xattr = {
>   .name = "array_map",
> - .map_type = BPF_MAP_TYPE_ARRAY,
> + .map_type = is_pcpu ? BPF_MAP_TYPE_PERCPU_ARRAY :
> +   BPF_MAP_TYPE_ARRAY,
>   .key_size = sizeof(int),
> - .value_size = sizeof(int),
> + .value_size = sizeof(__s64),
>   };
> - int map_fd, *keys, *values, *visited;
> + int map_fd, *keys, *visited;
>   __u32 count, total, total_success;
>   const __u32 max_entries = 10;
>   __u64 batch = 0;
> - int err, step;
> + int err, step, value_size;
> + void *values;
>   DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
>   .elem_flags = 0,
>   .flags = 0,
>   );
>  
> + CHECK(nr_cpus < 0, "nr_cpus checking",
> +   "error: get possible cpus failed");
> +
>   xattr.max_entries = max_entries;
>  	map_fd = bpf_create_map_xattr(&xattr);
>   CHECK(map_fd == -1,
> "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
>  
> - keys = 

Re: [PATCH v6 bpf-next 0/6] bpf: enable task local storage for tracing programs

2021-02-25 Thread Martin KaFai Lau
On Thu, Feb 25, 2021 at 03:43:13PM -0800, Song Liu wrote:
> This set enables task local storage for non-BPF_LSM programs.
> 
> It is common for tracing BPF programs to access per-task data. Currently,
> these data are stored in hash tables with pid as the key. In
> bcc/libbpftools [1], 9 out of 23 tools use such hash tables. However,
> hash tables are not ideal for many use cases. Task local storage provides
> better usability and performance for BPF programs. Please refer to 6/6 for
> some performance comparison of task local storage vs. hash table.
Thanks for the patches.

Acked-by: Martin KaFai Lau 


Re: [PATCH v6 bpf-next 2/6] bpf: prevent deadlock from recursive bpf_task_storage_[get|delete]

2021-02-25 Thread Martin KaFai Lau
On Thu, Feb 25, 2021 at 03:43:15PM -0800, Song Liu wrote:
> BPF helpers bpf_task_storage_[get|delete] could hold two locks:
> bpf_local_storage_map_bucket->lock and bpf_local_storage->lock. Calling
> these helpers from fentry/fexit programs on functions in bpf_*_storage.c
> may cause deadlock on either locks.
> 
> Prevent such deadlock with a per cpu counter, bpf_task_storage_busy. We
> need this counter to be global, because the two locks here belong to two
> different objects: bpf_local_storage_map and bpf_local_storage. If we
> pick one of them as the owner of the counter, it is still possible to
> trigger deadlock on the other lock. For example, if bpf_local_storage_map
> owns the counters, it cannot prevent deadlock on bpf_local_storage->lock
> when two maps are used.
Acked-by: Martin KaFai Lau 


Re: [PATCH v5 bpf-next 1/6] bpf: enable task local storage for tracing programs

2021-02-25 Thread Martin KaFai Lau
On Tue, Feb 23, 2021 at 02:28:40PM -0800, Song Liu wrote:
> To access per-task data, BPF programs usually create a hash table with
> pid as the key. This is not ideal because:
>  1. The user needs to estimate the proper size of the hash table, which may
> be inaccurate;
>  2. Big hash tables are slow;
>  3. To clean up the data properly during task terminations, the user needs
> to write extra logic.
> 
> Task local storage overcomes these issues and offers a better option for
> these per-task data. Task local storage is only available to BPF_LSM. Now
> enable it for tracing programs.
> 
> Unlike LSM programs, tracing programs can be called in IRQ contexts.
> Helpers that access task local storage are updated to use
> raw_spin_lock_irqsave() instead of raw_spin_lock_bh().
> 
> Tracing programs can attach to functions on the task free path, e.g.
> exit_creds(). To avoid allocating task local storage after
> bpf_task_storage_free(), bpf_task_storage_get() is updated to not allocate
> new storage when the task is not refcounted (task->usage == 0).
Acked-by: Martin KaFai Lau 


Re: [PATCH v4 bpf-next 6/6] bpf: runqslower: use task local storage

2021-02-23 Thread Martin KaFai Lau
On Mon, Feb 22, 2021 at 05:20:14PM -0800, Song Liu wrote:
> Replace hashtab with task local storage in runqslower. This improves the
> performance of these BPF programs. The following table summarizes average
> runtime of these programs, in nanoseconds:
> 
>                            task-local   hash-prealloc   hash-no-prealloc
> handle__sched_wakeup            125             340               3124
> handle__sched_wakeup_new       2812            1510               2998
> handle__sched_switch            151             208                991
Nice!  The required code change is also minimal.
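For anyone curious what "minimal" looks like on the BPF side, it is
essentially a task storage map definition plus bpf_task_storage_get()
calls, roughly like this (paraphrased, not the exact runqslower code):

    struct {
            __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __type(key, int);
            __type(value, u64);
    } start SEC(".maps");

    static int trace_enqueue(struct task_struct *t)
    {
            u64 *ts;

            /* one timestamp slot per task, created on demand and freed
             * automatically when the task exits */
            ts = bpf_task_storage_get(&start, t, 0,
                                      BPF_LOCAL_STORAGE_GET_F_CREATE);
            if (ts)
                    *ts = bpf_ktime_get_ns();
            return 0;
    }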


Re: [PATCH v4 bpf-next 5/6] bpf: runqslower: prefer using local vmlimux to generate vmlinux.h

2021-02-23 Thread Martin KaFai Lau
On Mon, Feb 22, 2021 at 05:20:13PM -0800, Song Liu wrote:
> Update the Makefile to prefer using $(O)/mvlinux, $(KBUILD_OUTPUT)/vmlinux
s/mvlinux/vmlinux/


Re: [PATCH v4 bpf-next 1/6] bpf: enable task local storage for tracing programs

2021-02-23 Thread Martin KaFai Lau
On Mon, Feb 22, 2021 at 05:20:09PM -0800, Song Liu wrote:
[ ... ]

> diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> index e0da0258b732d..2034019966d44 100644
> --- a/kernel/bpf/bpf_task_storage.c
> +++ b/kernel/bpf/bpf_task_storage.c
> @@ -15,7 +15,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
>  #include 
>  
> @@ -24,12 +23,8 @@ DEFINE_BPF_STORAGE_CACHE(task_cache);
>  static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
>  {
>   struct task_struct *task = owner;
> - struct bpf_storage_blob *bsb;
>  
> - bsb = bpf_task(task);
> - if (!bsb)
> - return NULL;
task_storage_ptr() no longer returns NULL.  All "!task_storage_ptr(task)"
checks should also be removed, e.g. in bpf_task_storage_get
and bpf_pid_task_storage_update_elem.
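For example, a check of this shape (paraphrased) becomes dead code once
task_storage_ptr() cannot return NULL:

    if (!task_storage_ptr(task)) {
            err = -ENOENT;  /* now unreachable */
            goto out;
    }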

> -	return &bsb->storage;
> +	return &task->bpf_storage;
>  }
>  


Re: [PATCH] bpf_lru_list: Read double-checked variable once without lock

2021-02-09 Thread Martin KaFai Lau
On Tue, Feb 09, 2021 at 12:27:01PM +0100, Marco Elver wrote:
> For double-checked locking in bpf_common_lru_push_free(), node->type is
> read outside the critical section and then re-checked under the lock.
> However, concurrent writes to node->type result in data races.
> 
> For example, the following concurrent access was observed by KCSAN:
> 
>   write to 0x88801521bc22 of 1 bytes by task 10038 on cpu 1:
>    __bpf_lru_node_move_in   kernel/bpf/bpf_lru_list.c:91
>    __local_list_flush       kernel/bpf/bpf_lru_list.c:298
>...
>   read to 0x88801521bc22 of 1 bytes by task 10043 on cpu 0:
>bpf_common_lru_push_free  kernel/bpf/bpf_lru_list.c:507
>bpf_lru_push_free kernel/bpf/bpf_lru_list.c:555
>...
> 
> Fix the data races where node->type is read outside the critical section
> (for double-checked locking) by marking the access with READ_ONCE() as
> well as ensuring the variable is only accessed once.
> 
> Reported-by: syzbot+3536db46dfa58c573...@syzkaller.appspotmail.com
> Reported-by: syzbot+516acdb03d3e27d91...@syzkaller.appspotmail.com
> Signed-off-by: Marco Elver 
> ---
> Detailed reports:
>   
> https://groups.google.com/g/syzkaller-upstream-moderation/c/PwsoQ7bfi8k/m/NH9Ni2WxAQAJ
>  
>   
> https://groups.google.com/g/syzkaller-upstream-moderation/c/-fXQO9ehxSM/m/RmQEcI2oAQAJ
>  
> ---
>  kernel/bpf/bpf_lru_list.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
> index 1b6b9349cb85..d99e89f113c4 100644
> --- a/kernel/bpf/bpf_lru_list.c
> +++ b/kernel/bpf/bpf_lru_list.c
> @@ -502,13 +502,14 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru 
> *lru, u32 hash)
>  static void bpf_common_lru_push_free(struct bpf_lru *lru,
>struct bpf_lru_node *node)
>  {
> + u8 node_type = READ_ONCE(node->type);
>   unsigned long flags;
>  
> - if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) ||
> - WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE))
> + if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) ||
> + WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE))
>   return;
>  
> - if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) {
> + if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) {
I think this can be bpf-next.

Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs

2021-01-12 Thread Martin KaFai Lau
On Mon, Jan 11, 2021 at 03:41:26PM -0800, Song Liu wrote:
> 
> 
> > On Jan 11, 2021, at 10:56 AM, Martin Lau  wrote:
> > 
> > On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
> > 
> > [ ... ]
> > 
> >> diff --git a/kernel/bpf/bpf_local_storage.c 
> >> b/kernel/bpf/bpf_local_storage.c
> >> index dd5aedee99e73..9bd47ad2b26f1 100644
> >> --- a/kernel/bpf/bpf_local_storage.c
> >> +++ b/kernel/bpf/bpf_local_storage.c
> >> @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct 
> >> bpf_local_storage_elem *selem)
> >> {
> >>struct bpf_local_storage *local_storage;
> >>bool free_local_storage = false;
> >> +  unsigned long flags;
> >> 
> >>if (unlikely(!selem_linked_to_storage(selem)))
> >>/* selem has already been unlinked from sk */
> >>return;
> >> 
> >>local_storage = rcu_dereference(selem->local_storage);
> >> -	raw_spin_lock_bh(&local_storage->lock);
> >> +	raw_spin_lock_irqsave(&local_storage->lock, flags);
> > It will be useful to have a few words in commit message on this change
> > for future reference purpose.
> > 
> > Please also remove the in_irq() check from bpf_sk_storage.c
> > to avoid confusion in the future.  It probably should
> > be in a separate patch.
> 
> Do you mean we allow bpf_sk_storage_get_tracing() and 
> bpf_sk_storage_delete_tracing() in irq context? Like
Right.

However, on second thought, let's skip that for now
until a use case comes up and a test can be written.


Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs

2021-01-11 Thread Martin KaFai Lau
On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote:
> On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau  wrote:
> >
> > On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
> >
> > [ ... ]
> >
> > > diff --git a/kernel/bpf/bpf_local_storage.c 
> > > b/kernel/bpf/bpf_local_storage.c
> > > index dd5aedee99e73..9bd47ad2b26f1 100644
> > > --- a/kernel/bpf/bpf_local_storage.c
> > > +++ b/kernel/bpf/bpf_local_storage.c
> > > @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct 
> > > bpf_local_storage_elem *selem)
> > >  {
> > >   struct bpf_local_storage *local_storage;
> > >   bool free_local_storage = false;
> > > + unsigned long flags;
> > >
> > >   if (unlikely(!selem_linked_to_storage(selem)))
> > >   /* selem has already been unlinked from sk */
> > >   return;
> > >
> > >   local_storage = rcu_dereference(selem->local_storage);
> > > -	raw_spin_lock_bh(&local_storage->lock);
> > > +	raw_spin_lock_irqsave(&local_storage->lock, flags);
> > It will be useful to have a few words in commit message on this change
> > for future reference purpose.
> >
> > Please also remove the in_irq() check from bpf_sk_storage.c
> > to avoid confusion in the future.  It probably should
> > be in a separate patch.
> >
> > [ ... ]
> >
> > > diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> > > index 4ef1959a78f27..f654b56907b69 100644
> > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > index 7425b3224891d..3d65c8ebfd594 100644
> > [ ... ]
> >
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -96,6 +96,7 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > >
> > >  #include 
> > >  #include 
> > > @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
> > >   cgroup_free(tsk);
> > >   task_numa_free(tsk, true);
> > >   security_task_free(tsk);
> > > + bpf_task_storage_free(tsk);
> > >   exit_creds(tsk);
> > If exit_creds() is traced by a bpf prog and this prog is doing
> > bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
> > new task storage will be created after bpf_task_storage_free().
> >
> > I recalled there was an earlier discussion with KP and KP mentioned
> > BPF_LSM will not be called with a task that is going away.
> > It seems enabling bpf task storage in bpf tracing will break
> > this assumption and needs to be addressed?
> 
> For tracing programs, I think we will need an allow list where
> task local storage can be used.
Instead of a whitelist, can refcount_inc_not_zero(&tsk->usage) be used?


Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs

2021-01-11 Thread Martin KaFai Lau
On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:

[ ... ]

> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index dd5aedee99e73..9bd47ad2b26f1 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct 
> bpf_local_storage_elem *selem)
>  {
>   struct bpf_local_storage *local_storage;
>   bool free_local_storage = false;
> + unsigned long flags;
>  
>   if (unlikely(!selem_linked_to_storage(selem)))
>   /* selem has already been unlinked from sk */
>   return;
>  
>   local_storage = rcu_dereference(selem->local_storage);
> -	raw_spin_lock_bh(&local_storage->lock);
> +	raw_spin_lock_irqsave(&local_storage->lock, flags);
It will be useful to have a few words in commit message on this change
for future reference purpose.

Please also remove the in_irq() check from bpf_sk_storage.c
to avoid confusion in the future.  It probably should
be in a separate patch.

[ ... ]

> diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> index 4ef1959a78f27..f654b56907b69 100644
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 7425b3224891d..3d65c8ebfd594 100644
[ ... ]

> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -96,6 +96,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
>   cgroup_free(tsk);
>   task_numa_free(tsk, true);
>   security_task_free(tsk);
> + bpf_task_storage_free(tsk);
>   exit_creds(tsk);
If exit_creds() is traced by a bpf prog and this prog is doing
bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
new task storage will be created after bpf_task_storage_free().

I recalled there was an earlier discussion with KP and KP mentioned
BPF_LSM will not be called with a task that is going away.
It seems enabling bpf task storage in bpf tracing will break
this assumption and needs to be addressed?
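A contrived example of the scenario (the map name, prog name and section
are illustrative only; it assumes the usual libbpf skeleton headers):

    /* a tracing prog on the task-free path that forces storage creation,
     * which is exactly what would defeat bpf_task_storage_free() */
    struct {
            __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __type(key, int);
            __type(value, long);
    } task_map SEC(".maps");

    SEC("fentry/exit_creds")
    int BPF_PROG(on_exit_creds, struct task_struct *tsk)
    {
            bpf_task_storage_get(&task_map, tsk, 0,
                                 BPF_LOCAL_STORAGE_GET_F_CREATE);
            return 0;
    }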


Re: [PATCH bpf] bpftool: fix compilation failure for net.o with older glibc

2021-01-06 Thread Martin KaFai Lau
On Wed, Jan 06, 2021 at 03:59:06PM +, Alan Maguire wrote:
> For older glibc ~2.17, #include'ing both linux/if.h and net/if.h
> fails due to complaints about redefinition of interface flags:
> 
>   CC   net.o
> In file included from net.c:13:0:
> /usr/include/linux/if.h:71:2: error: redeclaration of enumerator ‘IFF_UP’
>   IFF_UP= 1<<0,  /* sysfs */
>   ^
> /usr/include/net/if.h:44:5: note: previous definition of ‘IFF_UP’ was here
>  IFF_UP = 0x1,  /* Interface is up.  */
> 
> The issue was fixed in kernel headers in [1], but since compilation
> of net.c picks up system headers the problem can recur.
> 
> Dropping #include  resolves the issue and it is
> not needed for compilation anyhow.
Acked-by: Martin KaFai Lau 


Re: [PATCH] selftests/bpf: remove duplicate include in test_lsm

2021-01-05 Thread Martin KaFai Lau
On Tue, Jan 05, 2021 at 07:20:47AM -0800, menglong8.d...@gmail.com wrote:
> From: Menglong Dong 
> 
> 'unistd.h' included in 'selftests/bpf/prog_tests/test_lsm.c' is
> duplicated.
It is for bpf-next.  Please put a proper tag next time.

Acked-by: Martin KaFai Lau 


Re: [PATCH v3 0/4] btf: support ints larger than 128 bits

2021-01-05 Thread Martin KaFai Lau
On Tue, Jan 05, 2021 at 02:45:30PM +, Sean Young wrote:
> clang supports arbitrary length ints using the _ExtInt extension. This
> can be useful to hold very large values, e.g. 256 bit or 512 bit types.
> 
> Larger types (e.g. 1024 bits) are possible but I am unaware of a use
> case for these.
> 
> This requires the _ExtInt extension enabled in clang, which is under
> review.
1. Please explain the use case.
2. All patches have the same commit message which is not useful.
   Please spend some time in the commit message to explain what each
   individual patch does.
3. The test_extint.py is mostly a copy-and-paste from the existing
   test_offload.py?  Does it need most of the test_offload.py
   to test the BTF 256/512 bit int?  Please create a minimal
   test and use the test_progs.c infrastructure.


Re: [PATCH v1 bpf-next 05/11] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-12-16 Thread Martin KaFai Lau
On Thu, Dec 17, 2020 at 01:41:58AM +0900, Kuniyuki Iwashima wrote:
[ ... ]

> > There may also be places assuming that the req->rsk_listener will never
> > change once it is assigned.  not sure.  have not looked closely yet.
> 
> I have checked this again. There are no functions that explicitly expect
> req->rsk_listener to never change, except for the BUG_ON in inet_child_forget().
> The absence of a BUG_ON/WARN_ON does not mean they do not assume the listener
> never changes, but such functions still work properly if rsk_listener is changed.
The migration not only changes the ptr value of req->rsk_listener, it also
means req is moved to another listener. (e.g. by updating the qlen of
the old sk and new sk)

Let's reuse the example about two cores at the TCP_NEW_SYN_RECV path
racing to finish up the 3WHS.

One core is already at inet_csk_complete_hashdance() doing
"reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req)".
What happens if another core migrates the req to another listener?
Would that "reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req)"
then be operating on an accept_queue that this req no longer belongs to?

Also, from a quick look at reqsk_timer_handler() on how
queue->young and req->num_timeout are updated, I am not sure
the reqsk_queue_migrated() will work also:

+static inline void reqsk_queue_migrated(struct request_sock_queue *old_accept_queue,
+					struct request_sock_queue *new_accept_queue,
+					const struct request_sock *req)
+{
+	atomic_dec(&old_accept_queue->qlen);
+	atomic_inc(&new_accept_queue->qlen);
+
+	if (req->num_timeout == 0) {
What if reqsk_timer_handler() is running in parallel
and updating req->num_timeout?

+		atomic_dec(&old_accept_queue->young);
+		atomic_inc(&new_accept_queue->young);
+	}
+}


It feels like some of the "own_req" related logic may be useful here.
Not sure; it could be something worth thinking about.

> 
> 
> > It probably needs some more thoughts here to get a simpler solution.
> 
> Is it fine to move sock_hold() before assigning rsk_listener and defer
> sock_put() to the end of tcp_v[46]_rcv() ?
I don't see how this ordering helps, considering the migration can happen
at any time on another core.

> 
> Also, we have to rewrite rsk_listener first and then call sock_put() in
> reqsk_timer_handler() so that rsk_listener always has refcount more than 1.
> 
> ---8<---
>   struct sock *nsk, *osk;
>   bool migrated = false;
>   ...
>   sock_hold(req->rsk_listener);  // (i)
>   sk = req->rsk_listener;
>   ...
>   if (sk->sk_state == TCP_CLOSE) {
>   osk = sk;
>   // do migration without sock_put()
>   sock_hold(nsk);  // (ii) (as with (i))
>   sk = nsk;
>   migrated = true;
>   }
>   ...
>   if (migrated) {
>   sock_put(sk);  // pair with (ii)
>   sock_put(osk); // decrement old listener's refcount
>   sk = osk;
>   }
>   sock_put(sk);  // pair with (i)
> ---8<---


Re: [PATCH v1 bpf-next 05/11] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-12-14 Thread Martin KaFai Lau
On Tue, Dec 15, 2020 at 02:03:13AM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau 
> Date:   Thu, 10 Dec 2020 10:49:15 -0800
> > On Thu, Dec 10, 2020 at 02:15:38PM +0900, Kuniyuki Iwashima wrote:
> > > From:   Martin KaFai Lau 
> > > Date:   Wed, 9 Dec 2020 16:07:07 -0800
> > > > On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:
> > > > > This patch renames reuseport_select_sock() to 
> > > > > __reuseport_select_sock() and
> > > > > adds two wrapper function of it to pass the migration type defined in 
> > > > > the
> > > > > previous commit.
> > > > > 
> > > > >   reuseport_select_sock  : BPF_SK_REUSEPORT_MIGRATE_NO
> > > > >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST
> > > > > 
> > > > > As mentioned before, we have to select a new listener for 
> > > > > TCP_NEW_SYN_RECV
> > > > > requests at receiving the final ACK or sending a SYN+ACK. Therefore, 
> > > > > this
> > > > > patch also changes the code to call reuseport_select_migrated_sock() 
> > > > > even
> > > > > if the listening socket is TCP_CLOSE. If we can pick out a listening 
> > > > > socket
> > > > > from the reuseport group, we rewrite request_sock.rsk_listener and 
> > > > > resume
> > > > > processing the request.
> > > > > 
> > > > > Reviewed-by: Benjamin Herrenschmidt 
> > > > > Signed-off-by: Kuniyuki Iwashima 
> > > > > ---
> > > > >  include/net/inet_connection_sock.h | 12 +++
> > > > >  include/net/request_sock.h | 13 
> > > > >  include/net/sock_reuseport.h   |  8 +++
> > > > >  net/core/sock_reuseport.c  | 34 
> > > > > --
> > > > >  net/ipv4/inet_connection_sock.c| 13 ++--
> > > > >  net/ipv4/tcp_ipv4.c|  9 ++--
> > > > >  net/ipv6/tcp_ipv6.c|  9 ++--
> > > > >  7 files changed, 81 insertions(+), 17 deletions(-)
> > > > > 
> > > > > diff --git a/include/net/inet_connection_sock.h 
> > > > > b/include/net/inet_connection_sock.h
> > > > > index 2ea2d743f8fc..1e0958f5eb21 100644
> > > > > --- a/include/net/inet_connection_sock.h
> > > > > +++ b/include/net/inet_connection_sock.h
> > > > > @@ -272,6 +272,18 @@ static inline void 
> > > > > inet_csk_reqsk_queue_added(struct sock *sk)
> > > > >  	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
> > > > >  }
> > > > >  
> > > > > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
> > > > > +  struct sock *nsk,
> > > > > +  struct request_sock 
> > > > > *req)
> > > > > +{
> > > > > +	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
> > > > > +			     &inet_csk(nsk)->icsk_accept_queue,
> > > > > +			     req);
> > > > > + sock_put(sk);
> > > > not sure if it is safe to do here.
> > > > IIUC, when the req->rsk_refcnt is held, it also holds a refcnt
> > > > to req->rsk_listener such that sock_hold(req->rsk_listener) is
> > > > safe because its sk_refcnt is not zero.
> > > 
> > > I think it is safe to call sock_put() for the old listener here.
> > > 
> > > Without this patchset, at receiving the final ACK or retransmitting
> > > SYN+ACK, if sk_state == TCP_CLOSE, sock_put(req->rsk_listener) is done
> > > by calling reqsk_put() twice in inet_csk_reqsk_queue_drop_and_put().
> > Note that in your example (final ACK), sock_put(req->rsk_listener) is
> > _only_ called when reqsk_put() can get
> > refcount_dec_and_test(&req->rsk_refcnt) to reach zero.
> > 
> > Here in this patch, it does sock_put(req->rsk_listener) without
> > req->rsk_refcnt reaching zero.
> > 
> > Let's say there are two cores holding two refcnts to req (one cnt for
> > each core) by looking up the req from ehash.  One of the cores does this
> > migration and sock_put(req->rsk_listener).  The other core does
> > sock_hold(req->rsk_listener).
> > 
> > Core1

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-10 Thread Martin KaFai Lau
On Thu, Dec 10, 2020 at 02:58:10PM +0900, Kuniyuki Iwashima wrote:

[ ... ]

> > > I've implemented one-by-one migration only for the accept queue for now.
> > > In addition to the concern about TFO queue,
> > You meant this queue:  queue->fastopenq.rskq_rst_head?
> 
> Yes.
> 
> 
> > Can "req" be passed?
> > I did not look up the lock/race in details for that though.
> 
> I think if we rewrite freeing TFO requests part like one of accept queue
> using reqsk_queue_remove(), we can also migrate them.
> 
> In this patchset, selecting a listener for accept queue, the TFO queue of
> the same listener is also migrated to another listener in order to prevent
> TFO spoofing attack.
> 
> If the request in the accept queue is migrated one by one, I am wondering
> which should the request in TFO queue be migrated to prevent attack or
> freed.
> 
> I think user need not know about keeping such requests in kernel to prevent
> attacks, so passing them to eBPF prog is confusing. But, redistributing
> them randomly without user's intention can make some irrelevant listeners
> unnecessarily drop new TFO requests, so this is also bad. Moreover, freeing
> such requests seems not so good in the point of security.
The current behavior (during process restart) also does not carry this
security queue.  Will not carrying it in this patch make it any
less secure than the current behavior during process restart?
Do you need it now, or is it something that can be considered later
without changing uapi bpf.h?

> > > ---8<---
> > > diff --git a/net/ipv4/inet_connection_sock.c 
> > > b/net/ipv4/inet_connection_sock.c
> > > index a82fd4c912be..d0ddd3cb988b 100644
> > > --- a/net/ipv4/inet_connection_sock.c
> > > +++ b/net/ipv4/inet_connection_sock.c
> > > @@ -1001,6 +1001,29 @@ struct sock *inet_csk_reqsk_queue_add(struct sock 
> > > *sk,
> > >  }
> > >  EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
> > >  
> > > +static bool inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock 
> > > *nsk, struct request_sock *req)
> > > +{
> > > +	struct request_sock_queue *queue = &inet_csk(nsk)->icsk_accept_queue;
> > > +   bool migrated = false;
> > > +
> > > +   spin_lock(>rskq_lock);
> > > +   if (likely(nsk->sk_state == TCP_LISTEN)) {
> > > +   migrated = true;
> > > +
> > > +   req->dl_next = NULL;
> > > +   if (queue->rskq_accept_head == NULL)
> > > +   WRITE_ONCE(queue->rskq_accept_head, req);
> > > +   else
> > > +   queue->rskq_accept_tail->dl_next = req;
> > > +   queue->rskq_accept_tail = req;
> > > +   sk_acceptq_added(nsk);
> > > +   inet_csk_reqsk_queue_migrated(sk, nsk, req);
> > need to first resolve the question raised in patch 5 regarding
> > to the update on req->rsk_listener though.
> 
> In the unhash path, it is also safe to call sock_put() for the old listener.
> 
> In inet_csk_listen_stop(), the sk_refcnt of the listener >= 1. If the
> listener does not have immature requests, sk_refcnt is 1 and freed in
> __tcp_close().
> 
>   sock_hold(sk) in __tcp_close()
>   sock_put(sk) in inet_csk_destroy_sock()
>   sock_put(sk) in __tcp_close()
I don't see how it is different here than in patch 5.
I could be missing something.

Let's continue the discussion on the other thread (patch 5) first.

> 
> 
> > > +   }
> > > +   spin_unlock(>rskq_lock);
> > > +
> > > +   return migrated;
> > > +}
> > > +
> > >  struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock 
> > > *child,
> > >  struct request_sock *req, bool 
> > > own_req)
> > >  {
> > > @@ -1023,9 +1046,11 @@ EXPORT_SYMBOL(inet_csk_complete_hashdance);
> > >   */
> > >  void inet_csk_listen_stop(struct sock *sk)
> > >  {
> > > +   struct sock_reuseport *reuseport_cb = 
> > > rcu_access_pointer(sk->sk_reuseport_cb);
> > > struct inet_connection_sock *icsk = inet_csk(sk);
> > > 	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
> > > struct request_sock *next, *req;
> > > +   struct sock *nsk;
> > >  
> > > /* Following specs, it would be better either to send FIN
> > >  * (and enter FIN-WAIT-1, it is normal close)
> > > @@ -1043,8 +1068,19 @@ void inet_csk_listen_stop(struct sock *sk)
> > > WARN_ON(sock_owned_by_user(child));
> > > sock_hold(child);
> > >  
> > > +   if (reuseport_cb) {
> > > +   nsk = reuseport_select_migrated_sock(sk, 
> > > req_to_sk(req)->sk_hash, NULL);
> > > +   if (nsk) {
> > > +   if (inet_csk_reqsk_queue_migrate(sk, nsk, 
> > > req))
> > > +   goto unlock_sock;
> > > +   else
> > > +   sock_put(nsk);
> > > +   }
> > > +   }
> > > +
> > > 

Re: [PATCH v1 bpf-next 05/11] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-12-10 Thread Martin KaFai Lau
On Thu, Dec 10, 2020 at 02:15:38PM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau 
> Date:   Wed, 9 Dec 2020 16:07:07 -0800
> > On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:
> > > This patch renames reuseport_select_sock() to __reuseport_select_sock() 
> > > and
> > > adds two wrapper function of it to pass the migration type defined in the
> > > previous commit.
> > > 
> > >   reuseport_select_sock  : BPF_SK_REUSEPORT_MIGRATE_NO
> > >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST
> > > 
> > > As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV
> > > requests at receiving the final ACK or sending a SYN+ACK. Therefore, this
> > > patch also changes the code to call reuseport_select_migrated_sock() even
> > > if the listening socket is TCP_CLOSE. If we can pick out a listening 
> > > socket
> > > from the reuseport group, we rewrite request_sock.rsk_listener and resume
> > > processing the request.
> > > 
> > > Reviewed-by: Benjamin Herrenschmidt 
> > > Signed-off-by: Kuniyuki Iwashima 
> > > ---
> > >  include/net/inet_connection_sock.h | 12 +++
> > >  include/net/request_sock.h | 13 
> > >  include/net/sock_reuseport.h   |  8 +++
> > >  net/core/sock_reuseport.c  | 34 --
> > >  net/ipv4/inet_connection_sock.c| 13 ++--
> > >  net/ipv4/tcp_ipv4.c|  9 ++--
> > >  net/ipv6/tcp_ipv6.c|  9 ++--
> > >  7 files changed, 81 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/include/net/inet_connection_sock.h 
> > > b/include/net/inet_connection_sock.h
> > > index 2ea2d743f8fc..1e0958f5eb21 100644
> > > --- a/include/net/inet_connection_sock.h
> > > +++ b/include/net/inet_connection_sock.h
> > > @@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct 
> > > sock *sk)
> > >  	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
> > >  }
> > >  
> > > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
> > > +  struct sock *nsk,
> > > +  struct request_sock *req)
> > > +{
> > > +	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
> > > +			     &inet_csk(nsk)->icsk_accept_queue,
> > > +			     req);
> > > + sock_put(sk);
> > not sure if it is safe to do here.
> > IIUC, when the req->rsk_refcnt is held, it also holds a refcnt
> > to req->rsk_listener such that sock_hold(req->rsk_listener) is
> > safe because its sk_refcnt is not zero.
> 
> I think it is safe to call sock_put() for the old listener here.
> 
> Without this patchset, at receiving the final ACK or retransmitting
> SYN+ACK, if sk_state == TCP_CLOSE, sock_put(req->rsk_listener) is done
> by calling reqsk_put() twice in inet_csk_reqsk_queue_drop_and_put().
Note that in your example (final ACK), sock_put(req->rsk_listener) is
_only_ called when reqsk_put() can get refcount_dec_and_test(&req->rsk_refcnt)
to reach zero.

Here in this patch, it does sock_put(req->rsk_listener) without req->rsk_refcnt
reaching zero.

Let's say there are two cores holding two refcnts to req (one cnt for each core)
by looking up the req from ehash.  One of the cores does this migration and
sock_put(req->rsk_listener).  The other core does sock_hold(req->rsk_listener).

Core1                                   Core2
sock_put(req->rsk_listener)
                                        sock_hold(req->rsk_listener)

> And then, we do `goto lookup;` and overwrite the sk.
> 
> In the v2 patchset, refcount_inc_not_zero() is done for the new listener in
> reuseport_select_migrated_sock(), so we have to call sock_put() for the old
> listener instead to free it properly.
> 
> ---8<---
> +struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
> + struct sk_buff *skb)
> +{
> + struct sock *nsk;
> +
> +	nsk = __reuseport_select_sock(sk, hash, skb, 0,
> +				      BPF_SK_REUSEPORT_MIGRATE_REQUEST);
> +	if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))
There is another potential issue here.  The TCP_LISTEN nsk is protected
by rcu.  refcount_inc_not_zero(&nsk->sk_refcnt) cannot be done if it
is not under rcu_read_lock().

The receive path may be ok as it is in rcu.  You may need to check for
others.
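The usual shape of that pattern, for illustration only (not a patch):

    rcu_read_lock();
    nsk = __reuseport_select_sock(sk, hash, skb, 0,
                                  BPF_SK_REUSEPORT_MIGRATE_REQUEST);
    /* the refcount bump has to happen before rcu_read_unlock(),
     * otherwise nsk may already be freed when we take the ref */
    if (nsk && !refcount_inc_not_zero(&nsk->sk_refcnt))
            nsk = NULL;
    rcu_read_unlock();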

> + return nsk;
> +
> + return NULL;
> +}
> +EXPORT_SYMBOL(reuseport_select_migrated_sock);
> ---8<---
> https://lore.kernel.org/netdev/20201207132456.65472-8-kun...@amazon.co.jp/
> 
> 
> > > + sock_hold(nsk);
> > > + req->rsk_listener = nsk;
It looks like there is another race here.  What
if multiple cores try to update req->rsk_listener?


Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-09 Thread Martin KaFai Lau
On Thu, Dec 10, 2020 at 01:57:19AM +0900, Kuniyuki Iwashima wrote:
[ ... ]

> > > > I think it is a bit complex to pass the new listener from
> > > > reuseport_detach_sock() to inet_csk_listen_stop().
> > > > 
> > > > __tcp_close/tcp_disconnect/tcp_abort
> > > >  |-tcp_set_state
> > > >  |  |-unhash
> > > >  | |-reuseport_detach_sock (return nsk)
> > > >  |-inet_csk_listen_stop
> > > Picking the new listener does not have to be done in
> > > reuseport_detach_sock().
> > > 
> > > IIUC, it is done there only because it prefers to pick
> > > the last sk from socks[] when bpf prog is not attached.
> > > This seems to get into the way of exploring other potential
> > > implementation options.
> > 
> > Yes.
> > This is just idea, but we can reserve the last index of socks[] to hold the
> > last 'moved' socket in reuseport_detach_sock() and use it in
> > inet_csk_listen_stop().
> > 
> > 
> > > Merging the discussion on the last socks[] pick from another thread:
> > > >
> > > > I think most applications start new listeners before closing listeners, 
> > > > in
> > > > this case, selecting the moved socket as the new listener works well.
> > > >
> > > >
> > > > > That said, if it is still desired to do a random pick by kernel when
> > > > > there is no bpf prog, it probably makes sense to guard it in a sysctl 
> > > > > as
> > > > > suggested in another reply.  To keep it simple, I would also keep this
> > > > > kernel-pick consistent instead of request socket is doing something
> > > > > different from the unhash path.
> > > >
> > > > Then, is this way better to keep kernel-pick consistent?
> > > >
> > > >   1. call reuseport_select_migrated_sock() without sk_hash from any path
> > > >   2. generate a random number in reuseport_select_migrated_sock()
> > > >   3. pass it to __reuseport_select_sock() only for select-by-hash
> > > >   (4. pass 0 as sk_hash to bpf_run_sk_reuseport not to use it)
> > > >   5. do migration per queue in inet_csk_listen_stop() or per request in
> > > >  receive path.
> > > >
> > > > I understand it is beautiful to keep consistency, but also think
> > > > the kernel-pick with heuristic performs better than random-pick.
> > > I think discussing the best kernel pick without explicit user input
> > > is going to be a dead end. There is always a case that
> > > makes this heuristic (or guess) fail.  e.g. what if multiple
> > > sk(s) being closed are always the last one in the socks[]?
> > > all their child sk(s) will then be piled up at one listen sk
> > > because the last socks[] is always picked?
> > 
> > There can be such a case, but it means the newly listened sockets are
> > closed earlier than old ones.
> > 
> > 
> > > Lets assume the last socks[] is indeed the best for all cases.  Then why
> > > the in-progress req don't pick it this way?  I feel the implementation
> > > is doing what is convenient at that point.  And that is fine, I think
> > 
> > In this patchset, I originally assumed four things:
> > 
> >   migration should be done
> > (i)   from old to new
> > (ii)  to redistribute requests evenly as possible
> > (iii) to keep the order of requests in the queue
> >   (resulting in splicing queues)
> > (iv)  in O(1) for scalability
> >   (resulting in fix-up rsk_listener approach)
> > 
> > I selected the last socket in unhash path to satisfy above four because the
> > last socket changes at every close() syscall if application closes from
> > older socket.
> > 
> > But in receiving ACK or retransmitting SYN+ACK, we cannot get the last
> > 'moved' socket. Even if we reserve the last 'moved' socket in the last
> > index by the idea above, we cannot be sure the last socket is changed after
> > close() for each req->listener. For example, we have listeners A, B, C, and
> > D, and then call close(A) and close(B), and receive the final ACKs for A
> > and B, then both of them are assigned to C. In this case, A for D and B for
> > C is desired. So, selecting the last socket in socks[] for incoming
> > requests cannot realize (ii).
> > 
> > This is why I selected the last moved socket in unhash path and a random
> > listener in receive path.
> > 
> > 
> > > for kernel-pick, it should just go for simplicity and stay with
> > > the random(/hash) pick instead of pretending the kernel knows the
> > > application must operate in a certain way.  It is fine
> > > that the pick was wrong, the kernel will eventually move the
> > > childs/reqs to the survived listen sk.
> > 
> > Exactly. Also the heuristic way is not fair for every application.
> > 
> > After reading below idea (migrated_sk), I think random-pick is better
> > at simplicity and passing each sk.
> > 
> > 
> > > [ I still think the kernel should not even pick if
> > >   there is no bpf prog to instruct how to pick
> > >   but I am fine as long as there is a sysctl to
> > >   guard this. ]
> > 
> > Unless different applications listen on the same port, random-pick can save
> > connections which would 

Re: [PATCH v1 bpf-next 05/11] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-12-09 Thread Martin KaFai Lau
On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:
> This patch renames reuseport_select_sock() to __reuseport_select_sock() and
> adds two wrapper function of it to pass the migration type defined in the
> previous commit.
> 
>   reuseport_select_sock  : BPF_SK_REUSEPORT_MIGRATE_NO
>   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST
> 
> As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV
> requests at receiving the final ACK or sending a SYN+ACK. Therefore, this
> patch also changes the code to call reuseport_select_migrated_sock() even
> if the listening socket is TCP_CLOSE. If we can pick out a listening socket
> from the reuseport group, we rewrite request_sock.rsk_listener and resume
> processing the request.
> 
> Reviewed-by: Benjamin Herrenschmidt 
> Signed-off-by: Kuniyuki Iwashima 
> ---
>  include/net/inet_connection_sock.h | 12 +++
>  include/net/request_sock.h | 13 
>  include/net/sock_reuseport.h   |  8 +++
>  net/core/sock_reuseport.c  | 34 --
>  net/ipv4/inet_connection_sock.c| 13 ++--
>  net/ipv4/tcp_ipv4.c|  9 ++--
>  net/ipv6/tcp_ipv6.c|  9 ++--
>  7 files changed, 81 insertions(+), 17 deletions(-)
> 
> diff --git a/include/net/inet_connection_sock.h 
> b/include/net/inet_connection_sock.h
> index 2ea2d743f8fc..1e0958f5eb21 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct 
> sock *sk)
>  	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
>  }
>  
> +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
> +  struct sock *nsk,
> +  struct request_sock *req)
> +{
> +	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
> +			     &inet_csk(nsk)->icsk_accept_queue,
> +			     req);
> + sock_put(sk);
not sure if it is safe to do here.
IIUC, when the req->rsk_refcnt is held, it also holds a refcnt
to req->rsk_listener such that sock_hold(req->rsk_listener) is
safe because its sk_refcnt is not zero.

> + sock_hold(nsk);
> + req->rsk_listener = nsk;
> +}
> +

[ ... ]

> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 361efe55b1ad..e71653c6eae2 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -743,8 +743,17 @@ static void reqsk_timer_handler(struct timer_list *t)
>  	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
>   int max_syn_ack_retries, qlen, expire = 0, resend = 0;
>  
> - if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
> - goto drop;
> + if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
> + sk_listener = reuseport_select_migrated_sock(sk_listener,
> +  
> req_to_sk(req)->sk_hash, NULL);
> + if (!sk_listener) {
> + sk_listener = req->rsk_listener;
> + goto drop;
> + }
> + inet_csk_reqsk_queue_migrated(req->rsk_listener, sk_listener, 
> req);
> + icsk = inet_csk(sk_listener);
> +		queue = &icsk->icsk_accept_queue;
> + }
>  
>   max_syn_ack_retries = icsk->icsk_syn_retries ? : 
> net->ipv4.sysctl_tcp_synack_retries;
>   /* Normally all the openreqs are young and become mature
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index e4b31e70bd30..9a9aa27c6069 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1973,8 +1973,13 @@ int tcp_v4_rcv(struct sk_buff *skb)
>   goto csum_error;
>   }
>   if (unlikely(sk->sk_state != TCP_LISTEN)) {
> - inet_csk_reqsk_queue_drop_and_put(sk, req);
> - goto lookup;
> + nsk = reuseport_select_migrated_sock(sk, 
> req_to_sk(req)->sk_hash, skb);
> + if (!nsk) {
> + inet_csk_reqsk_queue_drop_and_put(sk, req);
> + goto lookup;
> + }
> + inet_csk_reqsk_queue_migrated(sk, nsk, req);
> + sk = nsk;
>   }
>   /* We own a reference on the listener, increase it again
>* as we might lose it too soon.
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 992cbf3eb9e3..ff11f3c0cb96 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1635,8 +1635,13 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
> *skb)
>   goto csum_error;
>   }
>   if (unlikely(sk->sk_state != TCP_LISTEN)) {
> - inet_csk_reqsk_queue_drop_and_put(sk, req);
> -

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-08 Thread Martin KaFai Lau
On Tue, Dec 08, 2020 at 05:17:48PM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau 
> Date:   Mon, 7 Dec 2020 23:34:41 -0800
> > On Tue, Dec 08, 2020 at 03:31:34PM +0900, Kuniyuki Iwashima wrote:
> > > From:   Martin KaFai Lau 
> > > Date:   Mon, 7 Dec 2020 12:33:15 -0800
> > > > On Thu, Dec 03, 2020 at 11:14:24PM +0900, Kuniyuki Iwashima wrote:
> > > > > From:   Eric Dumazet 
> > > > > Date:   Tue, 1 Dec 2020 16:25:51 +0100
> > > > > > On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> > > > > > > This patch lets reuseport_detach_sock() return a pointer of 
> > > > > > > struct sock,
> > > > > > > which is used only by inet_unhash(). If it is not NULL,
> > > > > > > inet_csk_reqsk_queue_migrate() migrates 
> > > > > > > TCP_ESTABLISHED/TCP_SYN_RECV
> > > > > > > sockets from the closing listener to the selected one.
> > > > > > > 
> > > > > > > Listening sockets hold incoming connections as a linked list of 
> > > > > > > struct
> > > > > > > request_sock in the accept queue, and each request has reference 
> > > > > > > to a full
> > > > > > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we 
> > > > > > > only unlink
> > > > > > > the requests from the closing listener's queue and relink them to 
> > > > > > > the head
> > > > > > > of the new listener's queue. We do not process each request and 
> > > > > > > its
> > > > > > > reference to the listener, so the migration completes in O(1) time
> > > > > > > complexity. However, in the case of TCP_SYN_RECV sockets, we take 
> > > > > > > special
> > > > > > > care in the next commit.
> > > > > > > 
> > > > > > > By default, the kernel selects a new listener randomly. In order 
> > > > > > > to pick
> > > > > > > out a different socket every time, we select the last element of 
> > > > > > > socks[] as
> > > > > > > the new listener. This behaviour is based on how the kernel moves 
> > > > > > > sockets
> > > > > > > in socks[]. (See also [1])
> > > > > > > 
> > > > > > > Basically, in order to redistribute sockets evenly, we have to 
> > > > > > > use an eBPF
> > > > > > > program called in the later commit, but as the side effect of 
> > > > > > > such default
> > > > > > > selection, the kernel can redistribute old requests evenly to new 
> > > > > > > listeners
> > > > > > > for a specific case where the application replaces listeners by
> > > > > > > generations.
> > > > > > > 
> > > > > > > For example, we call listen() for four sockets (A, B, C, D), and 
> > > > > > > close the
> > > > > > > first two by turns. The sockets move in socks[] like below.
> > > > > > > 
> > > > > > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > > > > > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > > > > > >   socks[2] : C   |  socks[2] : C --'
> > > > > > >   socks[3] : D --'
> > > > > > > 
> > > > > > > Then, if C and D have newer settings than A and B, and each 
> > > > > > > socket has a
> > > > > > > request (a, b, c, d) in their accept queue, we can redistribute 
> > > > > > > old
> > > > > > > requests evenly to new listeners.
> > > > > > > 
> > > > > > >   socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : 
> > > > > > > D (a + d)
> > > > > > >   socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : 
> > > > > > > C (b + c)
> > > > > > >   socks[2] : C (c)   |  socks[2] : C (c) --'
> > > > > > >   socks[3] : D (d) --'
> > > > > > > 
> > > > > > > Here, (A, D) or (B, C) can have different application settings, 
> > > > > > > but they
> > > 

Re: [PATCH bpf-next v3] bpf: Only provide bpf_sock_from_file with CONFIG_NET

2020-12-08 Thread Martin KaFai Lau
On Tue, Dec 08, 2020 at 06:36:23PM +0100, Florent Revest wrote:
> This moves the bpf_sock_from_file definition into net/core/filter.c
> which only gets compiled with CONFIG_NET and also moves the helper proto
> usage next to other tracing helpers that are conditional on CONFIG_NET.
> 
> This avoids
>   ld: kernel/trace/bpf_trace.o: in function `bpf_sock_from_file':
>   bpf_trace.c:(.text+0xe23): undefined reference to `sock_from_file'
> When compiling a kernel with BPF and without NET.
Acked-by: Martin KaFai Lau 


Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-08 Thread Martin KaFai Lau
On Tue, Dec 08, 2020 at 03:27:14PM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau 
> Date:   Mon, 7 Dec 2020 12:14:38 -0800
> > On Sun, Dec 06, 2020 at 01:03:07AM +0900, Kuniyuki Iwashima wrote:
> > > From:   Martin KaFai Lau 
> > > Date:   Fri, 4 Dec 2020 17:42:41 -0800
> > > > On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote:
> > > > [ ... ]
> > > > > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> > > > > index fd133516ac0e..60d7c1f28809 100644
> > > > > --- a/net/core/sock_reuseport.c
> > > > > +++ b/net/core/sock_reuseport.c
> > > > > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct 
> > > > > sock *sk2, bool bind_inany)
> > > > >  }
> > > > >  EXPORT_SYMBOL(reuseport_add_sock);
> > > > >  
> > > > > -void reuseport_detach_sock(struct sock *sk)
> > > > > +struct sock *reuseport_detach_sock(struct sock *sk)
> > > > >  {
> > > > >   struct sock_reuseport *reuse;
> > > > > + struct bpf_prog *prog;
> > > > > + struct sock *nsk = NULL;
> > > > >   int i;
> > > > >  
> > > > >  	spin_lock_bh(&reuseport_lock);
> > > > > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk)
> > > > >  
> > > > >   reuse->num_socks--;
> > > > >   reuse->socks[i] = reuse->socks[reuse->num_socks];
> > > > > + prog = rcu_dereference(reuse->prog);
> > > > Is it under rcu_read_lock() here?
> > > 
> > > reuseport_lock is locked in this function, and we do not modify the prog,
> > > but is rcu_dereference_protected() preferable?
> > > 
> > > ---8<---
> > > prog = rcu_dereference_protected(reuse->prog,
> > >                                  lockdep_is_held(&reuseport_lock));
> > > ---8<---
> > It is not only reuse->prog.  Other things also require rcu_read_lock(),
> > e.g. please take a look at __htab_map_lookup_elem().
> > 
> > The TCP_LISTEN sk (selected by bpf to be the target of the migration)
> > is also protected by rcu.
> 
> Thank you, I will use rcu_read_lock() and rcu_dereference() in v3 patchset.
> 
> 
> > I am surprised there is no WARNING in the test.
> > Do you have the needed DEBUG_LOCK* config enabled?
> 
> Yes, DEBUG_LOCK* was 'y', but rcu_dereference() without rcu_read_lock()
> does not show warnings...
I would at least expect the "WARN_ON_ONCE(!rcu_read_lock_held() ...)"
from __htab_map_lookup_elem() to fire in your test
example in the last patch.

It is better to check the config before sending v3.

[ ... ]

> > > > > diff --git a/net/ipv4/inet_connection_sock.c 
> > > > > b/net/ipv4/inet_connection_sock.c
> > > > > index 1451aa9712b0..b27241ea96bd 100644
> > > > > --- a/net/ipv4/inet_connection_sock.c
> > > > > +++ b/net/ipv4/inet_connection_sock.c
> > > > > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct 
> > > > > sock *sk,
> > > > >  }
> > > > >  EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
> > > > >  
> > > > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
> > > > > +{
> > > > > + struct request_sock_queue *old_accept_queue, *new_accept_queue;
> > > > > +
> > > > > +	old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
> > > > > +	new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
> > > > > +
> > > > > +	spin_lock(&old_accept_queue->rskq_lock);
> > > > > +	spin_lock(&new_accept_queue->rskq_lock);
> > > > I am also not very thrilled on this double spin_lock.
> > > > Can this be done in (or like) inet_csk_listen_stop() instead?
> > > 
> > > It will be possible to migrate sockets in inet_csk_listen_stop(), but I
> > > think it is better to do it just after reuseport_detach_sock() becuase we
> > > can select a different listener (almost) every time at a lower cost by
> > > selecting the moved socket and pass it to inet_csk_reqsk_queue_migrate()
> > > easily.
> > I don't see the "lower cost" point.  Please elaborate.
> 
> In reuseport_select_sock(), we pass sk_hash of the request socket to
> reciprocal_scale() and generate a random index for socks[] to select
> a different listener every tim

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-07 Thread Martin KaFai Lau
On Tue, Dec 08, 2020 at 03:31:34PM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau 
> Date:   Mon, 7 Dec 2020 12:33:15 -0800
> > On Thu, Dec 03, 2020 at 11:14:24PM +0900, Kuniyuki Iwashima wrote:
> > > From:   Eric Dumazet 
> > > Date:   Tue, 1 Dec 2020 16:25:51 +0100
> > > > On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> > > > > This patch lets reuseport_detach_sock() return a pointer of struct 
> > > > > sock,
> > > > > which is used only by inet_unhash(). If it is not NULL,
> > > > > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > > > > sockets from the closing listener to the selected one.
> > > > > 
> > > > > Listening sockets hold incoming connections as a linked list of struct
> > > > > request_sock in the accept queue, and each request has reference to a 
> > > > > full
> > > > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we only 
> > > > > unlink
> > > > > the requests from the closing listener's queue and relink them to the 
> > > > > head
> > > > > of the new listener's queue. We do not process each request and its
> > > > > reference to the listener, so the migration completes in O(1) time
> > > > > complexity. However, in the case of TCP_SYN_RECV sockets, we take 
> > > > > special
> > > > > care in the next commit.
> > > > > 
> > > > > By default, the kernel selects a new listener randomly. In order to 
> > > > > pick
> > > > > out a different socket every time, we select the last element of 
> > > > > socks[] as
> > > > > the new listener. This behaviour is based on how the kernel moves 
> > > > > sockets
> > > > > in socks[]. (See also [1])
> > > > > 
> > > > > Basically, in order to redistribute sockets evenly, we have to use an 
> > > > > eBPF
> > > > > program called in the later commit, but as the side effect of such 
> > > > > default
> > > > > selection, the kernel can redistribute old requests evenly to new 
> > > > > listeners
> > > > > for a specific case where the application replaces listeners by
> > > > > generations.
> > > > > 
> > > > > For example, we call listen() for four sockets (A, B, C, D), and 
> > > > > close the
> > > > > first two by turns. The sockets move in socks[] like below.
> > > > > 
> > > > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > > > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > > > >   socks[2] : C   |  socks[2] : C --'
> > > > >   socks[3] : D --'
> > > > > 
> > > > > Then, if C and D have newer settings than A and B, and each socket 
> > > > > has a
> > > > > request (a, b, c, d) in their accept queue, we can redistribute old
> > > > > requests evenly to new listeners.
> > > > > 
> > > > >   socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : D (a 
> > > > > + d)
> > > > >   socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b 
> > > > > + c)
> > > > >   socks[2] : C (c)   |  socks[2] : C (c) --'
> > > > >   socks[3] : D (d) --'
> > > > > 
> > > > > Here, (A, D) or (B, C) can have different application settings, but 
> > > > > they
> > > > > MUST have the same settings at the socket API level; otherwise, 
> > > > > unexpected
> > > > > error may happen. For instance, if only the new listeners have
> > > > > TCP_SAVE_SYN, old requests do not have SYN data, so the application 
> > > > > will
> > > > > face inconsistency and cause an error.
> > > > > 
> > > > > Therefore, if there are different kinds of sockets, we must attach an 
> > > > > eBPF
> > > > > program described in later commits.
> > > > > 
> > > > > Link: 
> > > > > https://lore.kernel.org/netdev/CAEfhGiyG8Y_amDZ2C8dQoQqjZJMHjTY76b=KBkTKcBtA=dh...@mail.gmail.com/
> > > > > Reviewed-by: Benjamin Herrenschmidt 
> > > > > Signed-off-by: Kuniyuki Iwashima 
> > > > > ---
> > > > 

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-07 Thread Martin KaFai Lau
On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote:

> @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk)
>  
>   reuse->num_socks--;
>   reuse->socks[i] = reuse->socks[reuse->num_socks];
> + prog = rcu_dereference(reuse->prog);
>  
>   if (sk->sk_protocol == IPPROTO_TCP) {
> + if (reuse->num_socks && !prog)
> + nsk = i == reuse->num_socks ? reuse->socks[i - 
> 1] : reuse->socks[i];
I asked in the earlier thread if the primary use case is to only
use the bpf prog to pick.  That thread did not come to
a solid answer but did conclude that the sysctl should not
control the behavior of the BPF_SK_REUSEPORT_SELECT_OR_MIGRATE prog.

From this change here, it seems it is still desired to depend only
on the kernel's random pick even when no bpf prog is attached.
If that is the case, a sysctl to guard this so the current
behavior is not changed makes sense.
It should still only control the non-bpf-pick behavior:
when the sysctl is on, the kernel will still do a random pick
when there is no bpf prog attached to the reuseport group.
Thoughts?
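As a rough sketch of that guard inside reuseport_detach_sock() (the sysctl
name below is only a placeholder for whatever knob would be added, not an
existing one):

        if (sk->sk_protocol == IPPROTO_TCP) {
                /* Fall back to the kernel pick only when no bpf prog is
                 * attached and the (placeholder) sysctl allows it; an
                 * attached SELECT_OR_MIGRATE prog stays in full control.
                 */
                if (!prog && reuse->num_socks &&
                    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req))
                        nsk = i == reuse->num_socks ? reuse->socks[i - 1] :
                                                      reuse->socks[i];

                reuse->num_closed_socks++;
                reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk;
        }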

> +
>   reuse->num_closed_socks++;
>   reuse->socks[reuse->max_socks - 
> reuse->num_closed_socks] = sk;
>   } else {
> @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk)
>   call_rcu(&reuse->rcu, reuseport_free_rcu);
>  out:
>   spin_unlock_bh(&reuseport_lock);
> +
> + return nsk;
>  }
>  EXPORT_SYMBOL(reuseport_detach_sock);



Re: [PATCH bpf-next v2] bpf: Only call sock_from_file with CONFIG_NET

2020-12-07 Thread Martin KaFai Lau
On Mon, Dec 07, 2020 at 09:06:05PM +0100, Florent Revest wrote:
> This avoids
>   ld: kernel/trace/bpf_trace.o: in function `bpf_sock_from_file':
>   bpf_trace.c:(.text+0xe23): undefined reference to `sock_from_file'
> When compiling a kernel with BPF and without NET.
> 
> Reported-by: Randy Dunlap 
> Signed-off-by: Florent Revest 
> ---
>  kernel/trace/bpf_trace.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 0cf0a6331482..29ec2b3b1cc4 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1272,7 +1272,11 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
>  
>  BPF_CALL_1(bpf_sock_from_file, struct file *, file)
>  {
> +#ifdef CONFIG_NET
>   return (unsigned long) sock_from_file(file);
> +#else
> + return 0;
> +#endif
>  }
Should bpf_sock_from_file_proto belong to
tracing_prog_func_proto() instead of bpf_tracing_func_proto()?
bpf_skc_to_*_proto is also in tracing_prog_func_proto()
where there is an existing "#ifdef CONFIG_NET".
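i.e. something along these lines (a sketch only; BPF_FUNC_sock_from_file is
assumed to be the new helper's func_id), so the helper is only exposed when
CONFIG_NET is set:

        /* kernel/trace/bpf_trace.c, tracing_prog_func_proto() */
        switch (func_id) {
#ifdef CONFIG_NET
        /* ... existing CONFIG_NET-only helpers (bpf_skc_to_*, ...) ... */
        case BPF_FUNC_sock_from_file:
                return &bpf_sock_from_file_proto;
#endif
        default:
                return raw_tp_prog_func_proto(func_id, prog);
        }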

>  
>  BTF_ID_LIST(bpf_sock_from_file_btf_ids)
> -- 
> 2.29.2.576.ga3fc446d84-goog
> 


Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-07 Thread Martin KaFai Lau
On Thu, Dec 03, 2020 at 11:14:24PM +0900, Kuniyuki Iwashima wrote:
> From:   Eric Dumazet 
> Date:   Tue, 1 Dec 2020 16:25:51 +0100
> > On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> > > This patch lets reuseport_detach_sock() return a pointer of struct sock,
> > > which is used only by inet_unhash(). If it is not NULL,
> > > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > > sockets from the closing listener to the selected one.
> > > 
> > > Listening sockets hold incoming connections as a linked list of struct
> > > request_sock in the accept queue, and each request has reference to a full
> > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we only unlink
> > > the requests from the closing listener's queue and relink them to the head
> > > of the new listener's queue. We do not process each request and its
> > > reference to the listener, so the migration completes in O(1) time
> > > complexity. However, in the case of TCP_SYN_RECV sockets, we take special
> > > care in the next commit.
> > > 
> > > By default, the kernel selects a new listener randomly. In order to pick
> > > out a different socket every time, we select the last element of socks[] 
> > > as
> > > the new listener. This behaviour is based on how the kernel moves sockets
> > > in socks[]. (See also [1])
> > > 
> > > Basically, in order to redistribute sockets evenly, we have to use an eBPF
> > > program called in the later commit, but as the side effect of such default
> > > selection, the kernel can redistribute old requests evenly to new 
> > > listeners
> > > for a specific case where the application replaces listeners by
> > > generations.
> > > 
> > > For example, we call listen() for four sockets (A, B, C, D), and close the
> > > first two by turns. The sockets move in socks[] like below.
> > > 
> > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > >   socks[2] : C   |  socks[2] : C --'
> > >   socks[3] : D --'
> > > 
> > > Then, if C and D have newer settings than A and B, and each socket has a
> > > request (a, b, c, d) in their accept queue, we can redistribute old
> > > requests evenly to new listeners.
> > > 
> > >   socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : D (a + d)
> > >   socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b + c)
> > >   socks[2] : C (c)   |  socks[2] : C (c) --'
> > >   socks[3] : D (d) --'
> > > 
> > > Here, (A, D) or (B, C) can have different application settings, but they
> > > MUST have the same settings at the socket API level; otherwise, unexpected
> > > error may happen. For instance, if only the new listeners have
> > > TCP_SAVE_SYN, old requests do not have SYN data, so the application will
> > > face inconsistency and cause an error.
> > > 
> > > Therefore, if there are different kinds of sockets, we must attach an eBPF
> > > program described in later commits.
> > > 
> > > Link: 
> > > https://lore.kernel.org/netdev/CAEfhGiyG8Y_amDZ2C8dQoQqjZJMHjTY76b=KBkTKcBtA=dh...@mail.gmail.com/
> > > Reviewed-by: Benjamin Herrenschmidt 
> > > Signed-off-by: Kuniyuki Iwashima 
> > > ---
> > >  include/net/inet_connection_sock.h |  1 +
> > >  include/net/sock_reuseport.h   |  2 +-
> > >  net/core/sock_reuseport.c  | 10 +-
> > >  net/ipv4/inet_connection_sock.c| 30 ++
> > >  net/ipv4/inet_hashtables.c |  9 +++--
> > >  5 files changed, 48 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/include/net/inet_connection_sock.h 
> > > b/include/net/inet_connection_sock.h
> > > index 7338b3865a2a..2ea2d743f8fc 100644
> > > --- a/include/net/inet_connection_sock.h
> > > +++ b/include/net/inet_connection_sock.h
> > > @@ -260,6 +260,7 @@ struct dst_entry *inet_csk_route_child_sock(const 
> > > struct sock *sk,
> > >  struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
> > > struct request_sock *req,
> > > struct sock *child);
> > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk);
> > >  void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock 
> > > *req,
> > >  unsigned long timeout);
> > >  struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock 
> > > *child,
> > > diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
> > > index 0e558ca7afbf..09a1b1539d4c 100644
> > > --- a/include/net/sock_reuseport.h
> > > +++ b/include/net/sock_reuseport.h
> > > @@ -31,7 +31,7 @@ struct sock_reuseport {
> > >  extern int reuseport_alloc(struct sock *sk, bool bind_inany);
> > >  extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
> > > bool bind_inany);
> > > -extern void reuseport_detach_sock(struct sock *sk);
> > > +extern struct sock *reuseport_detach_sock(struct sock *sk);
> > >  extern struct sock 

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-07 Thread Martin KaFai Lau
On Sun, Dec 06, 2020 at 01:03:07AM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau 
> Date:   Fri, 4 Dec 2020 17:42:41 -0800
> > On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote:
> > [ ... ]
> > > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> > > index fd133516ac0e..60d7c1f28809 100644
> > > --- a/net/core/sock_reuseport.c
> > > +++ b/net/core/sock_reuseport.c
> > > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock 
> > > *sk2, bool bind_inany)
> > >  }
> > >  EXPORT_SYMBOL(reuseport_add_sock);
> > >  
> > > -void reuseport_detach_sock(struct sock *sk)
> > > +struct sock *reuseport_detach_sock(struct sock *sk)
> > >  {
> > >   struct sock_reuseport *reuse;
> > > + struct bpf_prog *prog;
> > > + struct sock *nsk = NULL;
> > >   int i;
> > >  
> > >   spin_lock_bh(&reuseport_lock);
> > > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk)
> > >  
> > >   reuse->num_socks--;
> > >   reuse->socks[i] = reuse->socks[reuse->num_socks];
> > > + prog = rcu_dereference(reuse->prog);
> > Is it under rcu_read_lock() here?
> 
> reuseport_lock is locked in this function, and we do not modify the prog,
> but is rcu_dereference_protected() preferable?
> 
> ---8<---
> prog = rcu_dereference_protected(reuse->prog,
>   lockdep_is_held(&reuseport_lock));
> ---8<---
It is not only reuse->prog.  Other things also require rcu_read_lock(),
e.g. please take a look at __htab_map_lookup_elem().

The TCP_LISTEN sk (selected by bpf to be the target of the migration)
is also protected by rcu.

I am surprised there is no WARNING in the test.
Do you have the needed DEBUG_LOCK* config enabled?

> > >   if (sk->sk_protocol == IPPROTO_TCP) {
> > > + if (reuse->num_socks && !prog)
> > > + nsk = i == reuse->num_socks ? reuse->socks[i - 
> > > 1] : reuse->socks[i];
> > > +
> > >   reuse->num_closed_socks++;
> > >   reuse->socks[reuse->max_socks - 
> > > reuse->num_closed_socks] = sk;
> > >   } else {
> > > @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk)
> > >   call_rcu(&reuse->rcu, reuseport_free_rcu);
> > >  out:
> > >   spin_unlock_bh(&reuseport_lock);
> > > +
> > > + return nsk;
> > >  }
> > >  EXPORT_SYMBOL(reuseport_detach_sock);
> > >  
> > > diff --git a/net/ipv4/inet_connection_sock.c 
> > > b/net/ipv4/inet_connection_sock.c
> > > index 1451aa9712b0..b27241ea96bd 100644
> > > --- a/net/ipv4/inet_connection_sock.c
> > > +++ b/net/ipv4/inet_connection_sock.c
> > > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock 
> > > *sk,
> > >  }
> > >  EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
> > >  
> > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
> > > +{
> > > + struct request_sock_queue *old_accept_queue, *new_accept_queue;
> > > +
> > > + old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
> > > + new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
> > > +
> > > + spin_lock(&old_accept_queue->rskq_lock);
> > > + spin_lock(&new_accept_queue->rskq_lock);
> > I am also not very thrilled about this double spin_lock.
> > Can this be done in (or like) inet_csk_listen_stop() instead?
> 
> It will be possible to migrate sockets in inet_csk_listen_stop(), but I
> think it is better to do it just after reuseport_detach_sock() because we
> can select a different listener (almost) every time at a lower cost by
> selecting the moved socket and passing it to inet_csk_reqsk_queue_migrate()
> easily.
I don't see the "lower cost" point.  Please elaborate.

> 
> sk_hash of the listener is 0, so we would have to generate a random number
> in inet_csk_listen_stop().
If I read it correctly, it is also passing 0 as the sk_hash to
bpf_run_sk_reuseport() from reuseport_detach_sock().

Also, how is the sk_hash expected to be used?  I don't see
it in the test.


Re: [PATCH v1 bpf-next 11/11] bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.

2020-12-04 Thread Martin KaFai Lau
On Tue, Dec 01, 2020 at 11:44:18PM +0900, Kuniyuki Iwashima wrote:
> This patch adds a test for BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
> 
> Reviewed-by: Benjamin Herrenschmidt 
> Signed-off-by: Kuniyuki Iwashima 
> ---
>  .../bpf/prog_tests/migrate_reuseport.c| 164 ++
>  .../bpf/progs/test_migrate_reuseport_kern.c   |  54 ++
>  2 files changed, 218 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
>  create mode 100644 
> tools/testing/selftests/bpf/progs/test_migrate_reuseport_kern.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c 
> b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
> new file mode 100644
> index ..87c72d9ccadd
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
> @@ -0,0 +1,164 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Check if we can migrate child sockets.
> + *
> + *   1. call listen() for 5 server sockets.
> + *   2. update a map to migrate all child socket
> + *to the last server socket (migrate_map[cookie] = 4)
> + *   3. call connect() for 25 client sockets.
> + *   4. call close() for first 4 server sockets.
> + *   5. call accept() for the last server socket.
> + *
> + * Author: Kuniyuki Iwashima 
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define NUM_SOCKS 5
> +#define LOCALHOST "127.0.0.1"
> +#define err_exit(condition, message)   \
> + do {  \
> + if (condition) {  \
> + perror("ERROR: " message " ");\
> + exit(1);  \
> + } \
> + } while (0)
> +
> +__u64 server_fds[NUM_SOCKS];
> +int prog_fd, reuseport_map_fd, migrate_map_fd;
> +
> +
> +void setup_bpf(void)
> +{
> + struct bpf_object *obj;
> + struct bpf_program *prog;
> + struct bpf_map *reuseport_map, *migrate_map;
> + int err;
> +
> + obj = bpf_object__open("test_migrate_reuseport_kern.o");
> + err_exit(libbpf_get_error(obj), "opening BPF object file failed");
> +
> + err = bpf_object__load(obj);
> + err_exit(err, "loading BPF object failed");
> +
> + prog = bpf_program__next(NULL, obj);
> + err_exit(!prog, "loading BPF program failed");
> +
> + reuseport_map = bpf_object__find_map_by_name(obj, "reuseport_map");
> + err_exit(!reuseport_map, "loading BPF reuseport_map failed");
> +
> + migrate_map = bpf_object__find_map_by_name(obj, "migrate_map");
> + err_exit(!migrate_map, "loading BPF migrate_map failed");
> +
> + prog_fd = bpf_program__fd(prog);
> + reuseport_map_fd = bpf_map__fd(reuseport_map);
> + migrate_map_fd = bpf_map__fd(migrate_map);
> +}
> +
> +void test_listen(void)
> +{
> + struct sockaddr_in addr;
> + socklen_t addr_len = sizeof(addr);
> + int i, err, optval = 1, migrated_to = NUM_SOCKS - 1;
> + __u64 value;
> +
> + addr.sin_family = AF_INET;
> + addr.sin_port = htons(80);
> + inet_pton(AF_INET, LOCALHOST, &addr.sin_addr.s_addr);
> +
> + for (i = 0; i < NUM_SOCKS; i++) {
> + server_fds[i] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
> + err_exit(server_fds[i] == -1, "socket() for listener sockets 
> failed");
> +
> + err = setsockopt(server_fds[i], SOL_SOCKET, SO_REUSEPORT,
> +  &optval, sizeof(optval));
> + err_exit(err == -1, "setsockopt() for SO_REUSEPORT failed");
> +
> + if (i == 0) {
> + err = setsockopt(server_fds[i], SOL_SOCKET, 
> SO_ATTACH_REUSEPORT_EBPF,
> +  &prog_fd, sizeof(prog_fd));
> + err_exit(err == -1, "setsockopt() for 
> SO_ATTACH_REUSEPORT_EBPF failed");
> + }
> +
> + err = bind(server_fds[i], (struct sockaddr *)&addr, addr_len);
> + err_exit(err == -1, "bind() failed");
> +
> + err = listen(server_fds[i], 32);
> + err_exit(err == -1, "listen() failed");
> +
> + err = bpf_map_update_elem(reuseport_map_fd, &i, &server_fds[i], 
> BPF_NOEXIST);
> + err_exit(err == -1, "updating BPF reuseport_map failed");
> +
> + err = bpf_map_lookup_elem(reuseport_map_fd, &i, &value);
> + err_exit(err == -1, "looking up BPF reuseport_map failed");
> +
> + printf("fd[%d] (cookie: %llu) -> fd[%d]\n", i, value, 
> migrated_to);
> + err = bpf_map_update_elem(migrate_map_fd, &i, &migrated_to, 
> BPF_NOEXIST);
> + err_exit(err == -1, "updating BPF migrate_map failed");
> + }
> +}
> +
> +void test_connect(void)
> +{
> + struct sockaddr_in addr;
> + socklen_t addr_len = sizeof(addr);
> + int i, err, client_fd;
> +
> + 

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-04 Thread Martin KaFai Lau
On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote:
[ ... ]
> diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> index fd133516ac0e..60d7c1f28809 100644
> --- a/net/core/sock_reuseport.c
> +++ b/net/core/sock_reuseport.c
> @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock 
> *sk2, bool bind_inany)
>  }
>  EXPORT_SYMBOL(reuseport_add_sock);
>  
> -void reuseport_detach_sock(struct sock *sk)
> +struct sock *reuseport_detach_sock(struct sock *sk)
>  {
>   struct sock_reuseport *reuse;
> + struct bpf_prog *prog;
> + struct sock *nsk = NULL;
>   int i;
>  
>   spin_lock_bh(&reuseport_lock);
> @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk)
>  
>   reuse->num_socks--;
>   reuse->socks[i] = reuse->socks[reuse->num_socks];
> + prog = rcu_dereference(reuse->prog);
Is it under rcu_read_lock() here?

>  
>   if (sk->sk_protocol == IPPROTO_TCP) {
> + if (reuse->num_socks && !prog)
> + nsk = i == reuse->num_socks ? reuse->socks[i - 
> 1] : reuse->socks[i];
> +
>   reuse->num_closed_socks++;
>   reuse->socks[reuse->max_socks - 
> reuse->num_closed_socks] = sk;
>   } else {
> @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk)
>   call_rcu(&reuse->rcu, reuseport_free_rcu);
>  out:
>   spin_unlock_bh(&reuseport_lock);
> +
> + return nsk;
>  }
>  EXPORT_SYMBOL(reuseport_detach_sock);
>  
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 1451aa9712b0..b27241ea96bd 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
>  }
>  EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
>  
> +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
> +{
> + struct request_sock_queue *old_accept_queue, *new_accept_queue;
> +
> + old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
> + new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
> +
> + spin_lock(&old_accept_queue->rskq_lock);
> + spin_lock(&new_accept_queue->rskq_lock);
I am also not very thrilled about this double spin_lock.
Can this be done in (or like) inet_csk_listen_stop() instead?

> +
> + if (old_accept_queue->rskq_accept_head) {
> + if (new_accept_queue->rskq_accept_head)
> + old_accept_queue->rskq_accept_tail->dl_next =
> + new_accept_queue->rskq_accept_head;
> + else
> + new_accept_queue->rskq_accept_tail = 
> old_accept_queue->rskq_accept_tail;
> +
> + new_accept_queue->rskq_accept_head = 
> old_accept_queue->rskq_accept_head;
> + old_accept_queue->rskq_accept_head = NULL;
> + old_accept_queue->rskq_accept_tail = NULL;
> +
> + WRITE_ONCE(nsk->sk_ack_backlog, nsk->sk_ack_backlog + 
> sk->sk_ack_backlog);
> + WRITE_ONCE(sk->sk_ack_backlog, 0);
> + }
> +
> + spin_unlock(&new_accept_queue->rskq_lock);
> + spin_unlock(&old_accept_queue->rskq_lock);
> +}
> +EXPORT_SYMBOL(inet_csk_reqsk_queue_migrate);
> +
>  struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
>struct request_sock *req, bool own_req)
>  {
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index 45fb450b4522..545538a6bfac 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -681,6 +681,7 @@ void inet_unhash(struct sock *sk)
>  {
>   struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
>   struct inet_listen_hashbucket *ilb = NULL;
> + struct sock *nsk;
>   spinlock_t *lock;
>  
>   if (sk_unhashed(sk))
> @@ -696,8 +697,12 @@ void inet_unhash(struct sock *sk)
>   if (sk_unhashed(sk))
>   goto unlock;
>  
> - if (rcu_access_pointer(sk->sk_reuseport_cb))
> - reuseport_detach_sock(sk);
> + if (rcu_access_pointer(sk->sk_reuseport_cb)) {
> + nsk = reuseport_detach_sock(sk);
> + if (nsk)
> + inet_csk_reqsk_queue_migrate(sk, nsk);
> + }
> +
>   if (ilb) {
>   inet_unhash2(hashinfo, sk);
>   ilb->count--;
> -- 
> 2.17.2 (Apple Git-113)
> 


Re: [PATCH v1 bpf-next 01/11] tcp: Keep TCP_CLOSE sockets in the reuseport group.

2020-12-04 Thread Martin KaFai Lau
On Tue, Dec 01, 2020 at 11:44:08PM +0900, Kuniyuki Iwashima wrote:
> This patch is a preparation patch to migrate incoming connections in the
> later commits and adds a field (num_closed_socks) to the struct
> sock_reuseport to keep TCP_CLOSE sockets in the reuseport group.
> 
> When we close a listening socket, to migrate its connections to another
> listener in the same reuseport group, we have to handle two kinds of child
> sockets. One is that a listening socket has a reference to, and the other
> is not.
> 
> The former is the TCP_ESTABLISHED/TCP_SYN_RECV sockets, and they are in the
> accept queue of their listening socket. So, we can pop them out and push
> them into another listener's queue at close() or shutdown() syscalls. On
> the other hand, the latter, the TCP_NEW_SYN_RECV socket is during the
> three-way handshake and not in the accept queue. Thus, we cannot access
> such sockets at close() or shutdown() syscalls. Accordingly, we have to
> migrate immature sockets after their listening socket has been closed.
> 
> Currently, if their listening socket has been closed, TCP_NEW_SYN_RECV
> sockets are freed at receiving the final ACK or retransmitting SYN+ACKs. At
> that time, if we could select a new listener from the same reuseport group,
> no connection would be aborted. However, it is impossible because
> reuseport_detach_sock() sets NULL to sk_reuseport_cb and forbids access to
> the reuseport group from closed sockets.
> 
> This patch allows TCP_CLOSE sockets to remain in the reuseport group and to
> have access to it while any child socket references to them. The point is
> that reuseport_detach_sock() is called twice from inet_unhash() and
> sk_destruct(). At first, it moves the socket backwards in socks[] and
> increments num_closed_socks. Later, when all migrated connections are
> accepted, it removes the socket from socks[], decrements num_closed_socks,
> and sets NULL to sk_reuseport_cb.
> 
> By this change, closed sockets can keep sk_reuseport_cb until all child
> requests have been freed or accepted. Consequently calling listen() after
> shutdown() can cause EADDRINUSE or EBUSY in reuseport_add_sock() or
> inet_csk_bind_conflict() which expect that such sockets should not have the
> reuseport group. Therefore, this patch also loosens such validation rules
> so that the socket can listen again if it has the same reuseport group with
> other listening sockets.
> 
> Reviewed-by: Benjamin Herrenschmidt 
> Signed-off-by: Kuniyuki Iwashima 
> ---
>  include/net/sock_reuseport.h|  5 ++-
>  net/core/sock_reuseport.c   | 79 +++--
>  net/ipv4/inet_connection_sock.c |  7 ++-
>  3 files changed, 74 insertions(+), 17 deletions(-)
> 
> diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
> index 505f1e18e9bf..0e558ca7afbf 100644
> --- a/include/net/sock_reuseport.h
> +++ b/include/net/sock_reuseport.h
> @@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock;
>  struct sock_reuseport {
>   struct rcu_head rcu;
>  
> - u16 max_socks;  /* length of socks */
> - u16 num_socks;  /* elements in socks */
> + u16 max_socks;  /* length of socks */
> + u16 num_socks;  /* elements in socks */
> + u16 num_closed_socks;   /* closed elements in 
> socks */
>   /* The last synq overflow event timestamp of this
>* reuse->socks[] group.
>*/
> diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> index bbdd3c7b6cb5..fd133516ac0e 100644
> --- a/net/core/sock_reuseport.c
> +++ b/net/core/sock_reuseport.c
> @@ -98,16 +98,21 @@ static struct sock_reuseport *reuseport_grow(struct 
> sock_reuseport *reuse)
>   return NULL;
>  
>   more_reuse->num_socks = reuse->num_socks;
> + more_reuse->num_closed_socks = reuse->num_closed_socks;
>   more_reuse->prog = reuse->prog;
>   more_reuse->reuseport_id = reuse->reuseport_id;
>   more_reuse->bind_inany = reuse->bind_inany;
>   more_reuse->has_conns = reuse->has_conns;
> + more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
>  
>   memcpy(more_reuse->socks, reuse->socks,
>  reuse->num_socks * sizeof(struct sock *));
> - more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
> + memcpy(more_reuse->socks +
> +(more_reuse->max_socks - more_reuse->num_closed_socks),
> +reuse->socks + reuse->num_socks,
> +reuse->num_closed_socks * sizeof(struct sock *));
>  
> - for (i = 0; i < reuse->num_socks; ++i)
> + for (i = 0; i < reuse->max_socks; ++i)
>   rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
>  more_reuse);
>  
> @@ -129,6 +134,25 @@ static void reuseport_free_rcu(struct rcu_head *head)
>   kfree(reuse);
>  }
>  
> +static int 

Re: [PATCH v1 bpf-next 09/11] bpf: Support bpf_get_socket_cookie_sock() for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-04 Thread Martin KaFai Lau
On Tue, Dec 01, 2020 at 11:44:16PM +0900, Kuniyuki Iwashima wrote:
> We will call sock_reuseport.prog for socket migration in the next commit,
> so the eBPF program has to know which listener is closing in order to
> select the new listener.
> 
> Currently, we can get a unique ID for each listener in the userspace by
> calling bpf_map_lookup_elem() for BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.
> 
> This patch makes the sk pointer available in sk_reuseport_md so that we can
> get the ID by BPF_FUNC_get_socket_cookie() in the eBPF program.
> 
> Link: 
> https://lore.kernel.org/netdev/20201119001154.kapwihc2plp4f...@kafai-mbp.dhcp.thefacebook.com/
> Suggested-by: Martin KaFai Lau 
> Signed-off-by: Kuniyuki Iwashima 
> ---
>  include/uapi/linux/bpf.h   |  8 
>  net/core/filter.c  | 12 +++-
>  tools/include/uapi/linux/bpf.h |  8 
>  3 files changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index efe342bf3dbc..3e9b8bd42b4e 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1650,6 +1650,13 @@ union bpf_attr {
>   *   A 8-byte long non-decreasing number on success, or 0 if the
>   *   socket field is missing inside *skb*.
>   *
> + * u64 bpf_get_socket_cookie(struct bpf_sock *sk)
> + *   Description
> + *   Equivalent to bpf_get_socket_cookie() helper that accepts
> + *   *skb*, but gets socket from **struct bpf_sock** context.
> + *   Return
> + *   A 8-byte long non-decreasing number.
> + *
>   * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
>   *   Description
>   *   Equivalent to bpf_get_socket_cookie() helper that accepts
> @@ -4420,6 +4427,7 @@ struct sk_reuseport_md {
>   __u32 bind_inany;   /* Is sock bound to an INANY address? */
>   __u32 hash; /* A hash of the packet 4 tuples */
>   __u8 migration; /* Migration type */
> + __bpf_md_ptr(struct bpf_sock *, sk); /* current listening socket */
>  };
>  
>  #define BPF_TAG_SIZE 8
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 0a0634787bb4..1059d31847ef 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4628,7 +4628,7 @@ static const struct bpf_func_proto 
> bpf_get_socket_cookie_sock_proto = {
>   .func   = bpf_get_socket_cookie_sock,
>   .gpl_only   = false,
>   .ret_type   = RET_INTEGER,
> - .arg1_type  = ARG_PTR_TO_CTX,
> + .arg1_type  = ARG_PTR_TO_SOCKET,
This will break existing bpf progs (BPF_PROG_TYPE_CGROUP_SOCK)
using this proto.  A new proto is needed, and there is
an ongoing patch doing this [0].

[0]: https://lore.kernel.org/bpf/20201203213330.1657666-1-rev...@google.com/
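i.e. keep the existing ARG_PTR_TO_CTX proto for BPF_PROG_TYPE_CGROUP_SOCK
untouched and add a second proto for callers that pass a socket pointer; a
rough sketch (the proto name here is only illustrative):

static const struct bpf_func_proto bpf_get_socket_cookie_sock_common_proto = {
        /* reuse the existing helper body, only the argument type differs */
        .func           = bpf_get_socket_cookie_sock,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_SOCKET,
};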


Re: [PATCH bpf-next v2 1/3] bpf: Expose bpf_get_socket_cookie to tracing programs

2020-12-04 Thread Martin KaFai Lau
On Thu, Dec 03, 2020 at 10:33:28PM +0100, Florent Revest wrote:
> This creates a new helper proto because the existing
> bpf_get_socket_cookie_sock_proto has a ARG_PTR_TO_CTX argument and only
> works for BPF programs where the context is a sock.
> 
> This helper could also be useful to other BPF program types such as LSM.
> 
> Signed-off-by: Florent Revest 
> ---
>  include/uapi/linux/bpf.h   | 7 +++
>  kernel/trace/bpf_trace.c   | 4 
>  net/core/filter.c  | 7 +++
>  tools/include/uapi/linux/bpf.h | 7 +++
>  4 files changed, 25 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c3458ec1f30a..3e0e33c43998 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1662,6 +1662,13 @@ union bpf_attr {
>   *   Return
>   *   A 8-byte long non-decreasing number.
>   *
> + * u64 bpf_get_socket_cookie(void *sk)
> + *   Description
> + *   Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
> + *   *sk*, but gets socket from a BTF **struct sock**.
> + *   Return
> + *   A 8-byte long non-decreasing number.
> + *
>   * u32 bpf_get_socket_uid(struct sk_buff *skb)
>   *   Return
>   *   The owner UID of the socket associated to *skb*. If the socket
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index d255bc9b2bfa..14ad96579813 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1725,6 +1725,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const 
> struct bpf_prog *prog)
>   }
>  }
>  
> +extern const struct bpf_func_proto bpf_get_socket_cookie_sock_tracing_proto;
> +
>  const struct bpf_func_proto *
>  tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog 
> *prog)
>  {
> @@ -1748,6 +1750,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const 
> struct bpf_prog *prog)
>   return &bpf_sk_storage_get_tracing_proto;
>   case BPF_FUNC_sk_storage_delete:
>   return &bpf_sk_storage_delete_tracing_proto;
> + case BPF_FUNC_get_socket_cookie:
> + return &bpf_get_socket_cookie_sock_tracing_proto;
>  #endif
>   case BPF_FUNC_seq_printf:
>   return prog->expected_attach_type == BPF_TRACE_ITER ?
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 2ca5eecebacf..177c4e5e529d 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4631,6 +4631,13 @@ static const struct bpf_func_proto 
> bpf_get_socket_cookie_sock_proto = {
>   .arg1_type  = ARG_PTR_TO_CTX,
>  };
>  
> +const struct bpf_func_proto bpf_get_socket_cookie_sock_tracing_proto = {
> + .func   = bpf_get_socket_cookie_sock,
> + .gpl_only   = false,
> + .ret_type   = RET_INTEGER,
> + .arg1_type  = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
In tracing where it gets a sk pointer, the sk could be NULL.
A NULL check is required in the helper. Please refer to
bpf_skc_to_tcp_sock[_proto] as an example.

This proto is in general also useful for non-tracing contexts where
a sk pointer can be obtained (e.g. another similar usage that will
have a hold on a sk pointer for BPF_PROG_TYPE_SK_REUSEPORT [0]).
In case you don't need sleepable at this point, as Daniel
mentioned in another thread, does it make sense to rename this
proto to something like bpf_get_socket_pointer_cookie_proto?

[0]: https://lore.kernel.org/bpf/20201201144418.35045-10-kun...@amazon.co.jp/
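For reference, a sketch of the NULL-checked variant (the helper body assumes a
cookie generator along the lines of sock_gen_cookie(); the exact function used
in the final patch may differ):

BPF_CALL_1(bpf_get_socket_pointer_cookie, struct sock *, sk)
{
        /* A tracing prog may hand us a NULL sk; do not dereference it. */
        return sk ? sock_gen_cookie(sk) : 0;
}

const struct bpf_func_proto bpf_get_socket_pointer_cookie_proto = {
        .func           = bpf_get_socket_pointer_cookie,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
};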


Re: [PATCH v1 bpf-next 06/11] bpf: Introduce two attach types for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-03 Thread Martin KaFai Lau
On Thu, Dec 03, 2020 at 11:16:08PM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau 
> Date:   Wed, 2 Dec 2020 20:24:02 -0800
> > On Wed, Dec 02, 2020 at 11:19:02AM -0800, Martin KaFai Lau wrote:
> > > On Tue, Dec 01, 2020 at 06:04:50PM -0800, Andrii Nakryiko wrote:
> > > > On Tue, Dec 1, 2020 at 6:49 AM Kuniyuki Iwashima  
> > > > wrote:
> > > > >
> > > > > This commit adds new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT to
> > > > > check if the attached eBPF program is capable of migrating sockets.
> > > > >
> > > > > When the eBPF program is attached, the kernel runs it for socket 
> > > > > migration
> > > > > only if the expected_attach_type is 
> > > > > BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
> > > > > The kernel will change the behaviour depending on the returned value:
> > > > >
> > > > >   - SK_PASS with selected_sk, select it as a new listener
> > > > >   - SK_PASS with selected_sk NULL, fall back to the random selection
> > > > >   - SK_DROP, cancel the migration
> > > > >
> > > > > Link: 
> > > > > https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6t...@kafai-mbp.dhcp.thefacebook.com/
> > > > > Suggested-by: Martin KaFai Lau 
> > > > > Signed-off-by: Kuniyuki Iwashima 
> > > > > ---
> > > > >  include/uapi/linux/bpf.h   | 2 ++
> > > > >  kernel/bpf/syscall.c   | 8 
> > > > >  tools/include/uapi/linux/bpf.h | 2 ++
> > > > >  3 files changed, 12 insertions(+)
> > > > >
> > > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > > index 85278deff439..cfc207ae7782 100644
> > > > > --- a/include/uapi/linux/bpf.h
> > > > > +++ b/include/uapi/linux/bpf.h
> > > > > @@ -241,6 +241,8 @@ enum bpf_attach_type {
> > > > > BPF_XDP_CPUMAP,
> > > > > BPF_SK_LOOKUP,
> > > > > BPF_XDP,
> > > > > +   BPF_SK_REUSEPORT_SELECT,
> > > > > +   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> > > > > __MAX_BPF_ATTACH_TYPE
> > > > >  };
> > > > >
> > > > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > > > > index f3fe9f53f93c..a0796a8de5ea 100644
> > > > > --- a/kernel/bpf/syscall.c
> > > > > +++ b/kernel/bpf/syscall.c
> > > > > @@ -2036,6 +2036,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type 
> > > > > prog_type,
> > > > > if (expected_attach_type == BPF_SK_LOOKUP)
> > > > > return 0;
> > > > > return -EINVAL;
> > > > > +   case BPF_PROG_TYPE_SK_REUSEPORT:
> > > > > +   switch (expected_attach_type) {
> > > > > +   case BPF_SK_REUSEPORT_SELECT:
> > > > > +   case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
> > > > > +   return 0;
> > > > > +   default:
> > > > > +   return -EINVAL;
> > > > > +   }
> > > > 
> > > > this is a kernel regression, previously expected_attach_type wasn't
> > > > enforced, so user-space could have provided any number without an
> > > > error.
> > > I also think this change alone will break things like the usual
> > > attr->expected_attach_type == 0 case.  At least changes are needed in
> > > bpf_prog_load_fixup_attach_type() which is also handling a
> > > similar situation for BPF_PROG_TYPE_CGROUP_SOCK.
> > > 
> > > I now think there is no need to expose new bpf_attach_type to the UAPI.
> > > Since the prog->expected_attach_type is not used, it can be cleared at 
> > > load time
> > > and then only set to BPF_SK_REUSEPORT_SELECT_OR_MIGRATE (probably defined
> > > internally at filter.[c|h]) in the is_valid_access() when "migration"
> > > is accessed.  When "migration" is accessed, the bpf prog can handle
> > > migration (and the original not-migration) case.
> > Scrap this internal only BPF_SK_REUSEPORT_SELECT_OR_MIGRATE idea.
> > I think there will be cases that bpf prog wants to do both
> > without accessing any field from sk_reuseport_md.
> > 
> > Lets go back to the discussion on using a similar
> > idea as BPF_PROG_TYPE_CGROUP_SOCK in bpf_prog_load_fixup_attach_type().
> > I am not aware of any loader setting a random number
> > in expected_attach_type, so the chance of breaking
> > is very low.  There was a similar discussion earlier [0].
> > 
> > [0]: https://lore.kernel.org/netdev/20200126045443.f47dzxdglazzchfm@ast-mbp/
> 
> Thank you for the idea and reference.
> 
> I will remove the change in bpf_prog_load_check_attach() and set the
> default value (BPF_SK_REUSEPORT_SELECT) in bpf_prog_load_fixup_attach_type()
> for backward compatibility if expected_attach_type is 0.
check_attach_type() can be kept.  You can refer to
commit aac3fc320d94 for a similar situation.
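For the expected_attach_type == 0 case, a sketch of the
bpf_prog_load_fixup_attach_type() handling being suggested, mirroring the
existing BPF_PROG_TYPE_CGROUP_SOCK default:

static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
{
        switch (attr->prog_type) {
        case BPF_PROG_TYPE_CGROUP_SOCK:
                if (!attr->expected_attach_type)
                        attr->expected_attach_type =
                                BPF_CGROUP_INET_SOCK_CREATE;
                break;
        case BPF_PROG_TYPE_SK_REUSEPORT:
                /* Old loaders pass 0; treat it as the plain SELECT type
                 * so existing programs keep working unchanged.
                 */
                if (!attr->expected_attach_type)
                        attr->expected_attach_type =
                                BPF_SK_REUSEPORT_SELECT;
                break;
        }
}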


Re: [PATCH bpf-next v4 6/6] bpf: Test bpf_sk_storage_get in tcp iterators

2020-12-03 Thread Martin KaFai Lau
On Wed, Dec 02, 2020 at 09:55:27PM +0100, Florent Revest wrote:
> This extends the existing bpf_sk_storage_get test where a socket is
> created and tagged with its creator's pid by a task_file iterator.
> 
> A TCP iterator is now also used at the end of the test to negate the
> values already stored in the local storage. The test therefore expects
> -getpid() to be stored in the local storage.
> 
> Signed-off-by: Florent Revest 
> Acked-by: Yonghong Song 
> ---
>  .../selftests/bpf/prog_tests/bpf_iter.c| 13 +
>  .../progs/bpf_iter_bpf_sk_storage_helpers.c| 18 ++
>  2 files changed, 31 insertions(+)
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c 
> b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> index 9336d0f18331..b8362147c9e3 100644
> --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> @@ -978,6 +978,8 @@ static void test_bpf_sk_storage_delete(void)
>  /* This creates a socket and its local storage. It then runs a task_iter BPF
>   * program that replaces the existing socket local storage with the tgid of 
> the
>   * only task owning a file descriptor to this socket, this process, 
> prog_tests.
> + * It then runs a tcp socket iterator that negates the value in the existing
> + * socket local storage, the test verifies that the resulting value is -pid.
>   */
>  static void test_bpf_sk_storage_get(void)
>  {
> @@ -994,6 +996,10 @@ static void test_bpf_sk_storage_get(void)
>   if (CHECK(sock_fd < 0, "socket", "errno: %d\n", errno))
>   goto out;
>  
> + err = listen(sock_fd, 1);
> + if (CHECK(err != 0, "listen", "errno: %d\n", errno))
> + goto out;

goto close_socket;

> +
>   map_fd = bpf_map__fd(skel->maps.sk_stg_map);
>  
>   err = bpf_map_update_elem(map_fd, &sock_fd, &val, BPF_NOEXIST);
> @@ -1007,6 +1013,13 @@ static void test_bpf_sk_storage_get(void)
> "map value wasn't set correctly (expected %d, got %d, err=%d)\n",
> getpid(), val, err);
The failure of this CHECK here should "goto close_socket;" now.

Others LGTM.

Acked-by: Martin KaFai Lau 
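i.e. both spots should unwind through close_socket, roughly:

        err = listen(sock_fd, 1);
        if (CHECK(err != 0, "listen", "errno: %d\n", errno))
                goto close_socket;

        /* ... */

        if (CHECK(err || val != getpid(), "bpf_map_lookup_elem",
                  "map value wasn't set correctly (expected %d, got %d, err=%d)\n",
                  getpid(), val, err))
                goto close_socket;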

>  
> + do_dummy_read(skel->progs.negate_socket_local_storage);
> +
> + err = bpf_map_lookup_elem(map_fd, &sock_fd, &val);
> + CHECK(err || val != -getpid(), "bpf_map_lookup_elem",
> +   "map value wasn't set correctly (expected %d, got %d, err=%d)\n",
> +   -getpid(), val, err);
> +
>  close_socket:
>   close(sock_fd);
>  out:


Re: [PATCH bpf-next v4 5/6] bpf: Add an iterator selftest for bpf_sk_storage_get

2020-12-03 Thread Martin KaFai Lau
On Wed, Dec 02, 2020 at 09:55:26PM +0100, Florent Revest wrote:
> The eBPF program iterates over all files and tasks. For all socket
> files, it stores the tgid of the last task it encountered with a handle
> to that socket. This is a heuristic for finding the "owner" of a socket
> similar to what's done by lsof, ss, netstat or fuser. Potentially, this
> information could be used from a cgroup_skb/*gress hook to try to
> associate network traffic with processes.
> 
> The test makes sure that a socket it created is tagged with prog_tests's
> pid.
Acked-by: Martin KaFai Lau 


Re: [PATCH v1 bpf-next 06/11] bpf: Introduce two attach types for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-02 Thread Martin KaFai Lau
On Wed, Dec 02, 2020 at 11:19:02AM -0800, Martin KaFai Lau wrote:
> On Tue, Dec 01, 2020 at 06:04:50PM -0800, Andrii Nakryiko wrote:
> > On Tue, Dec 1, 2020 at 6:49 AM Kuniyuki Iwashima  
> > wrote:
> > >
> > > This commit adds new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT to
> > > check if the attached eBPF program is capable of migrating sockets.
> > >
> > > When the eBPF program is attached, the kernel runs it for socket migration
> > > only if the expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
> > > The kernel will change the behaviour depending on the returned value:
> > >
> > >   - SK_PASS with selected_sk, select it as a new listener
> > >   - SK_PASS with selected_sk NULL, fall back to the random selection
> > >   - SK_DROP, cancel the migration
> > >
> > > Link: 
> > > https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6t...@kafai-mbp.dhcp.thefacebook.com/
> > > Suggested-by: Martin KaFai Lau 
> > > Signed-off-by: Kuniyuki Iwashima 
> > > ---
> > >  include/uapi/linux/bpf.h   | 2 ++
> > >  kernel/bpf/syscall.c   | 8 
> > >  tools/include/uapi/linux/bpf.h | 2 ++
> > >  3 files changed, 12 insertions(+)
> > >
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index 85278deff439..cfc207ae7782 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -241,6 +241,8 @@ enum bpf_attach_type {
> > > BPF_XDP_CPUMAP,
> > > BPF_SK_LOOKUP,
> > > BPF_XDP,
> > > +   BPF_SK_REUSEPORT_SELECT,
> > > +   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> > > __MAX_BPF_ATTACH_TYPE
> > >  };
> > >
> > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > > index f3fe9f53f93c..a0796a8de5ea 100644
> > > --- a/kernel/bpf/syscall.c
> > > +++ b/kernel/bpf/syscall.c
> > > @@ -2036,6 +2036,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type 
> > > prog_type,
> > > if (expected_attach_type == BPF_SK_LOOKUP)
> > > return 0;
> > > return -EINVAL;
> > > +   case BPF_PROG_TYPE_SK_REUSEPORT:
> > > +   switch (expected_attach_type) {
> > > +   case BPF_SK_REUSEPORT_SELECT:
> > > +   case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
> > > +   return 0;
> > > +   default:
> > > +   return -EINVAL;
> > > +   }
> > 
> > this is a kernel regression, previously expected_attach_type wasn't
> > enforced, so user-space could have provided any number without an
> > error.
> I also think this change alone will break things like the usual
> attr->expected_attach_type == 0 case.  At least changes are needed in
> bpf_prog_load_fixup_attach_type() which is also handling a
> similar situation for BPF_PROG_TYPE_CGROUP_SOCK.
> 
> I now think there is no need to expose new bpf_attach_type to the UAPI.
> Since the prog->expected_attach_type is not used, it can be cleared at load 
> time
> and then only set to BPF_SK_REUSEPORT_SELECT_OR_MIGRATE (probably defined
> internally at filter.[c|h]) in the is_valid_access() when "migration"
> is accessed.  When "migration" is accessed, the bpf prog can handle
> migration (and the original not-migration) case.
Scrap this internal only BPF_SK_REUSEPORT_SELECT_OR_MIGRATE idea.
I think there will be cases that bpf prog wants to do both
without accessing any field from sk_reuseport_md.

Lets go back to the discussion on using a similar
idea as BPF_PROG_TYPE_CGROUP_SOCK in bpf_prog_load_fixup_attach_type().
I am not aware of any loader setting a random number
in expected_attach_type, so the chance of breaking
is very low.  There was a similar discussion earlier [0].

[0]: https://lore.kernel.org/netdev/20200126045443.f47dzxdglazzchfm@ast-mbp/

> 
> > 
> > > case BPF_PROG_TYPE_EXT:
> > > if (expected_attach_type)
> > > return -EINVAL;
> > > diff --git a/tools/include/uapi/linux/bpf.h 
> > > b/tools/include/uapi/linux/bpf.h
> > > index 85278deff439..cfc207ae7782 100644
> > > --- a/tools/include/uapi/linux/bpf.h
> > > +++ b/tools/include/uapi/linux/bpf.h
> > > @@ -241,6 +241,8 @@ enum bpf_attach_type {
> > > BPF_XDP_CPUMAP,
> > > BPF_SK_LOOKUP,
> > > BPF_XDP,
> > > +   BPF_SK_REUSEPORT_SELECT,
> > > +   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> > > __MAX_BPF_ATTACH_TYPE
> > >  };
> > >
> > > --
> > > 2.17.2 (Apple Git-113)
> > >


Re: [PATCH v1 bpf-next 06/11] bpf: Introduce two attach types for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-02 Thread Martin KaFai Lau
On Tue, Dec 01, 2020 at 06:04:50PM -0800, Andrii Nakryiko wrote:
> On Tue, Dec 1, 2020 at 6:49 AM Kuniyuki Iwashima  wrote:
> >
> > This commit adds new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT to
> > check if the attached eBPF program is capable of migrating sockets.
> >
> > When the eBPF program is attached, the kernel runs it for socket migration
> > only if the expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
> > The kernel will change the behaviour depending on the returned value:
> >
> >   - SK_PASS with selected_sk, select it as a new listener
> >   - SK_PASS with selected_sk NULL, fall back to the random selection
> >   - SK_DROP, cancel the migration
> >
> > Link: 
> > https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6t...@kafai-mbp.dhcp.thefacebook.com/
> > Suggested-by: Martin KaFai Lau 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >  include/uapi/linux/bpf.h   | 2 ++
> >  kernel/bpf/syscall.c   | 8 
> >  tools/include/uapi/linux/bpf.h | 2 ++
> >  3 files changed, 12 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 85278deff439..cfc207ae7782 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -241,6 +241,8 @@ enum bpf_attach_type {
> > BPF_XDP_CPUMAP,
> > BPF_SK_LOOKUP,
> > BPF_XDP,
> > +   BPF_SK_REUSEPORT_SELECT,
> > +   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> > __MAX_BPF_ATTACH_TYPE
> >  };
> >
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index f3fe9f53f93c..a0796a8de5ea 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -2036,6 +2036,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type 
> > prog_type,
> > if (expected_attach_type == BPF_SK_LOOKUP)
> > return 0;
> > return -EINVAL;
> > +   case BPF_PROG_TYPE_SK_REUSEPORT:
> > +   switch (expected_attach_type) {
> > +   case BPF_SK_REUSEPORT_SELECT:
> > +   case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
> > +   return 0;
> > +   default:
> > +   return -EINVAL;
> > +   }
> 
> this is a kernel regression, previously expected_attach_type wasn't
> enforced, so user-space could have provided any number without an
> error.
I also think this change alone will break things like the usual
attr->expected_attach_type == 0 case.  At least changes are needed in
bpf_prog_load_fixup_attach_type() which is also handling a
similar situation for BPF_PROG_TYPE_CGROUP_SOCK.

I now think there is no need to expose new bpf_attach_type to the UAPI.
Since the prog->expected_attach_type is not used, it can be cleared at load time
and then only set to BPF_SK_REUSEPORT_SELECT_OR_MIGRATE (probably defined
internally at filter.[c|h]) in the is_valid_access() when "migration"
is accessed.  When "migration" is accessed, the bpf prog can handle
migration (and the original not-migration) case.

> 
> > case BPF_PROG_TYPE_EXT:
> > if (expected_attach_type)
> > return -EINVAL;
> > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> > index 85278deff439..cfc207ae7782 100644
> > --- a/tools/include/uapi/linux/bpf.h
> > +++ b/tools/include/uapi/linux/bpf.h
> > @@ -241,6 +241,8 @@ enum bpf_attach_type {
> > BPF_XDP_CPUMAP,
> > BPF_SK_LOOKUP,
> > BPF_XDP,
> > +   BPF_SK_REUSEPORT_SELECT,
> > +   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> > __MAX_BPF_ATTACH_TYPE
> >  };
> >
> > --
> > 2.17.2 (Apple Git-113)
> >


Re: [RFC PATCH bpf-next 3/8] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-11-22 Thread Martin KaFai Lau
On Sat, Nov 21, 2020 at 07:13:22PM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau 
> Date:   Thu, 19 Nov 2020 17:53:46 -0800
> > On Fri, Nov 20, 2020 at 07:09:22AM +0900, Kuniyuki Iwashima wrote:
> > > From: Martin KaFai Lau 
> > > Date: Wed, 18 Nov 2020 15:50:17 -0800
> > > > On Tue, Nov 17, 2020 at 06:40:18PM +0900, Kuniyuki Iwashima wrote:
> > > > > This patch lets reuseport_detach_sock() return a pointer of struct 
> > > > > sock,
> > > > > which is used only by inet_unhash(). If it is not NULL,
> > > > > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > > > > sockets from the closing listener to the selected one.
> > > > > 
> > > > > Listening sockets hold incoming connections as a linked list of struct
> > > > > request_sock in the accept queue, and each request has reference to a 
> > > > > full
> > > > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we unlink 
> > > > > the
> > > > > requests from the closing listener's queue and relink them to the 
> > > > > head of
> > > > > the new listener's queue. We do not process each request, so the 
> > > > > migration
> > > > > completes in O(1) time complexity. However, in the case of 
> > > > > TCP_SYN_RECV
> > > > > sockets, we will take special care in the next commit.
> > > > > 
> > > > > By default, we select the last element of socks[] as the new listener.
> > > > > This behaviour is based on how the kernel moves sockets in socks[].
> > > > > 
> > > > > For example, we call listen() for four sockets (A, B, C, D), and 
> > > > > close the
> > > > > first two by turns. The sockets move in socks[] like below. (See also 
> > > > > [1])
> > > > > 
> > > > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > > > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > > > >   socks[2] : C   |  socks[2] : C --'
> > > > >   socks[3] : D --'
> > > > > 
> > > > > Then, if C and D have newer settings than A and B, and each socket 
> > > > > has a
> > > > > request (a, b, c, d) in their accept queue, we can redistribute old
> > > > > requests evenly to new listeners.
> > > > I don't think it should emphasize/claim there is a specific way that
> > > > the kernel-pick here can redistribute the requests evenly.  It depends 
> > > > on
> > > > how the application close/listen.  The userspace can not expect the
> > > > ordering of socks[] will behave in a certain way.
> > > 
> > > I've expected replacing listeners by generations as a general use case.
> > > But exactly. Users should not expect the undocumented kernel internal.
> > > 
> > > 
> > > > The primary redistribution policy has to depend on BPF which is the
> > > > policy defined by the user based on its application logic (e.g. how
> > > > its binary restart work).  The application (and bpf) knows which one
> > > > is a dying process and can avoid distributing to it.
> > > > 
> > > > The kernel-pick could be an optional fallback but not a must.  If the 
> > > > bpf
> > > > prog is attached, I would even go further to call bpf to redistribute
> > > > regardless of the sysctl, so I think the sysctl is not necessary.
> > > 
> > > I also think it is just an optional fallback, but to pick out a different
> > > listener everytime, choosing the moved socket was reasonable. So the even
> > > redistribution for a specific use case is a side effect of such socket
> > > selection.
> > > 
> > > But, users should decide to use either way:
> > >   (1) let the kernel select a new listener randomly
> > >   (2) select a particular listener by eBPF
> > > 
> > > I will update the commit message like:
> > > The kernel selects a new listener randomly, but as the side effect, it can
> > > redistribute packets evenly for a specific case where an application
> > > replaces listeners by generations.
> > Since there is no feedback on the sysctl, maybe something was missed
> > in the lines.
> 
> I'm sorry, I have missed this point while thinking about each reply...
> 
> 
> > I don't think this migration logic should depend on a sysctl.

Re: [RFC PATCH bpf-next 0/8] Socket migration for SO_REUSEPORT.

2020-11-19 Thread Martin KaFai Lau
On Fri, Nov 20, 2020 at 07:17:49AM +0900, Kuniyuki Iwashima wrote:
> From:   Martin KaFai Lau 
> Date:   Wed, 18 Nov 2020 17:49:13 -0800
> > On Tue, Nov 17, 2020 at 06:40:15PM +0900, Kuniyuki Iwashima wrote:
> > > The SO_REUSEPORT option allows sockets to listen on the same port and to
> > > accept connections evenly. However, there is a defect in the current
> > > implementation. When a SYN packet is received, the connection is tied to a
> > > listening socket. Accordingly, when the listener is closed, in-flight
> > > requests during the three-way handshake and child sockets in the accept
> > > queue are dropped even if other listeners could accept such connections.
> > > 
> > > This situation can happen when various server management tools restart
> > > server (such as nginx) processes. For instance, when we change nginx
> > > configurations and restart it, it spins up new workers that respect the 
> > > new
> > > configuration and closes all listeners on the old workers, resulting in
> > > in-flight ACK of 3WHS is responded by RST.
> > > 
> > > As a workaround for this issue, we can do connection draining by eBPF:
> > > 
> > >   1. Before closing a listener, stop routing SYN packets to it.
> > >   2. Wait enough time for requests to complete 3WHS.
> > >   3. Accept connections until EAGAIN, then close the listener.
> > > 
> > > Although this approach seems to work well, EAGAIN has nothing to do with
> > > how many requests are still during 3WHS. Thus, we have to know the number
> > It sounds like the application can already drain the established socket
> > by accept()?  To solve the problem that you have,
> > does it mean migrating req_sk (the in-progress 3WHS) is enough?
> 
> Ideally, the application needs to drain only the accepted sockets because
> 3WHS and tying a connection to a listener are just kernel behaviour. Also,
> there are some cases where we want to apply new configurations as soon as
> possible such as replacing TLS certificates.
> 
> It is possible to drain the established sockets by accept(), but the
> sockets in the accept queue have not started application sessions yet. So,
> if we do not drain such sockets (or if the kernel happened to select
> another listener), we can (could) apply the new settings much earlier.
> 
> Moreover, the established sockets may start long-standing connections so
> that we cannot complete draining for a long time and may have to
> force-close them (and they would have longer lifetime if they are migrated
> to a new listener).
> 
> 
> > Applications can already use the bpf prog to do (1) and divert
> > the SYN to the newly started process.
> > 
> > If the application cares about service disruption,
> > it usually needs to drain the fd(s) that it already has and
> > finishes serving the pending request (e.g. https) on them anyway.
> > The time taking to finish those could already be longer than it takes
> > to drain the accept queue or finish off the 3WHS in reasonable time.
> > or the application that you have does not need to drain the fd(s) 
> > it already has and it can close them immediately?
> 
> In the point of view of service disruption, I agree with you.
> 
> However, I think that there are some situations where we want to apply new
> configurations rather than to drain sockets with old configurations and
> that if the kernel migrates sockets automatically, we can simplify user
> programs.
This configuration-update (new-TLS-cert, etc.) consideration would be
useful to also include in the cover letter.

It sounds like the service that you have is draining the existing
already-accepted fd(s) which are using the old configuration.
Those existing fd(s) could also be long-lived.  Potentially those
existing fd(s) will be much greater in number than the
to-be-accepted fd(s)?

Or did you mean that in some cases it wants to migrate to the new
configuration ASAP (e.g. for security reasons), even if it has to close
all the already-accepted fd(s) which are using the old configuration?

In either case, considering the already-accepted fd(s)
are usually much greater in number, do the to-be-accepted
connections make any difference percentage-wise?


Re: [RFC PATCH bpf-next 3/8] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-11-19 Thread Martin KaFai Lau
On Fri, Nov 20, 2020 at 07:09:22AM +0900, Kuniyuki Iwashima wrote:
> From: Martin KaFai Lau 
> Date: Wed, 18 Nov 2020 15:50:17 -0800
> > On Tue, Nov 17, 2020 at 06:40:18PM +0900, Kuniyuki Iwashima wrote:
> > > This patch lets reuseport_detach_sock() return a pointer of struct sock,
> > > which is used only by inet_unhash(). If it is not NULL,
> > > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > > sockets from the closing listener to the selected one.
> > > 
> > > Listening sockets hold incoming connections as a linked list of struct
> > > request_sock in the accept queue, and each request has reference to a full
> > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we unlink the
> > > requests from the closing listener's queue and relink them to the head of
> > > the new listener's queue. We do not process each request, so the migration
> > > completes in O(1) time complexity. However, in the case of TCP_SYN_RECV
> > > sockets, we will take special care in the next commit.
> > > 
> > > By default, we select the last element of socks[] as the new listener.
> > > This behaviour is based on how the kernel moves sockets in socks[].
> > > 
> > > For example, we call listen() for four sockets (A, B, C, D), and close the
> > > first two by turns. The sockets move in socks[] like below. (See also [1])
> > > 
> > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > >   socks[2] : C   |  socks[2] : C --'
> > >   socks[3] : D --'
> > > 
> > > Then, if C and D have newer settings than A and B, and each socket has a
> > > request (a, b, c, d) in their accept queue, we can redistribute old
> > > requests evenly to new listeners.
> > I don't think it should emphasize/claim there is a specific way that
> > the kernel-pick here can redistribute the requests evenly.  It depends on
> > how the application close/listen.  The userspace can not expect the
> > ordering of socks[] will behave in a certain way.
> 
> I've expected replacing listeners by generations as a general use case.
> But exactly. Users should not expect the undocumented kernel internal.
> 
> 
> > The primary redistribution policy has to depend on BPF which is the
> > policy defined by the user based on its application logic (e.g. how
> > its binary restart work).  The application (and bpf) knows which one
> > is a dying process and can avoid distributing to it.
> > 
> > The kernel-pick could be an optional fallback but not a must.  If the bpf
> > prog is attached, I would even go further to call bpf to redistribute
> > regardless of the sysctl, so I think the sysctl is not necessary.
> 
> I also think it is just an optional fallback, but to pick out a different
> listener everytime, choosing the moved socket was reasonable. So the even
> redistribution for a specific use case is a side effect of such socket
> selection.
> 
> But, users should decide to use either way:
>   (1) let the kernel select a new listener randomly
>   (2) select a particular listener by eBPF
> 
> I will update the commit message like:
> The kernel selects a new listener randomly, but as the side effect, it can
> redistribute packets evenly for a specific case where an application
> replaces listeners by generations.
Since there is no feedback on the sysctl, maybe something was missed
in the lines.

I don't think this migration logic should depend on a sysctl.
At least not when a bpf prog is attached that is capable of doing
migration, it is too fragile to ask user to remember to turn on
the sysctl before attaching the bpf prog.

Is your use case primarily based on the bpf prog doing the pick, or only
on the kernel doing a random pick?

Also, IIUC, this sysctl setting sticks at "*reuse", there is no way to
change it until all the listening sockets are closed which is exactly
the service disruption problem this series is trying to solve here.
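
For reference, a minimal sketch of the bpf-prog-driven pick (map names and
layout are illustrative only; user space is assumed to keep the
REUSEPORT_SOCKARRAY populated and to point "target" at a listener from the
new generation before the old workers close):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
        __uint(max_entries, 16);
        __type(key, __u32);
        __type(value, __u64);
} listeners SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, __u32);
} target SEC(".maps");

SEC("sk_reuseport")
int select_listener(struct sk_reuseport_md *reuse_md)
{
        __u32 zero = 0, *idx;

        idx = bpf_map_lookup_elem(&target, &zero);
        if (!idx)
                return SK_DROP;

        /* Steer the connection to the chosen (non-dying) listener. */
        if (bpf_sk_select_reuseport(reuse_md, &listeners, idx, 0))
                return SK_DROP;

        return SK_PASS;
}

char _license[] SEC("license") = "GPL";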


Re: [PATCH v2 5/5] bpf: Add an iterator selftest for bpf_sk_storage_get

2020-11-19 Thread Martin KaFai Lau
On Thu, Nov 19, 2020 at 05:26:54PM +0100, Florent Revest wrote:
> From: Florent Revest 
> 
> The eBPF program iterates over all files and tasks. For all socket
> files, it stores the tgid of the last task it encountered with a handle
> to that socket. This is a heuristic for finding the "owner" of a socket
> similar to what's done by lsof, ss, netstat or fuser. Potentially, this
> information could be used from a cgroup_skb/*gress hook to try to
> associate network traffic with processes.
> 
> The test makes sure that a socket it created is tagged with prog_tests's
> pid.
> 
> Signed-off-by: Florent Revest 
> ---
>  .../selftests/bpf/prog_tests/bpf_iter.c   | 35 +++
>  .../progs/bpf_iter_bpf_sk_storage_helpers.c   | 26 ++
>  2 files changed, 61 insertions(+)
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c 
> b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> index bb4a638f2e6f..4d0626003c03 100644
> --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> @@ -975,6 +975,39 @@ static void test_bpf_sk_storage_delete(void)
>   bpf_iter_bpf_sk_storage_helpers__destroy(skel);
>  }
>  
> +/* The BPF program stores in every socket the tgid of a task owning a handle 
> to
> + * it. The test verifies that a locally-created socket is tagged with its pid
> + */
> +static void test_bpf_sk_storage_get(void)
> +{
> + struct bpf_iter_bpf_sk_storage_helpers *skel;
> + int err, map_fd, val = -1;
> + int sock_fd = -1;
> +
> + skel = bpf_iter_bpf_sk_storage_helpers__open_and_load();
> + if (CHECK(!skel, "bpf_iter_bpf_sk_storage_helpers__open_and_load",
> +   "skeleton open_and_load failed\n"))
> + return;
> +
> + sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
> + if (CHECK(sock_fd < 0, "socket", "errno: %d\n", errno))
> + goto out;
> +
> + do_dummy_read(skel->progs.fill_socket_owners);
> +
> + map_fd = bpf_map__fd(skel->maps.sk_stg_map);
> +
> + err = bpf_map_lookup_elem(map_fd, &sock_fd, &val);
> + CHECK(err || val != getpid(), "bpf_map_lookup_elem",
> +   "map value wasn't set correctly (expected %d, got %d, err=%d)\n",
> +   getpid(), val, err);
> +
> + if (sock_fd >= 0)
> + close(sock_fd);
> +out:
> + bpf_iter_bpf_sk_storage_helpers__destroy(skel);
> +}
> +
>  static void test_bpf_sk_storage_map(void)
>  {
>   DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
> @@ -1131,6 +1164,8 @@ void test_bpf_iter(void)
>   test_bpf_sk_storage_map();
>   if (test__start_subtest("bpf_sk_storage_delete"))
>   test_bpf_sk_storage_delete();
> + if (test__start_subtest("bpf_sk_storage_get"))
> + test_bpf_sk_storage_get();
>   if (test__start_subtest("rdonly-buf-out-of-bound"))
>   test_rdonly_buf_out_of_bound();
>   if (test__start_subtest("buf-neg-offset"))
> diff --git 
> a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c 
> b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c
> index 01ff3235e413..7206fd6f09ab 100644
> --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c
> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c
> @@ -21,3 +21,29 @@ int delete_bpf_sk_storage_map(struct 
> bpf_iter__bpf_sk_storage_map *ctx)
>  
>   return 0;
>  }
> +
> +SEC("iter/task_file")
> +int fill_socket_owners(struct bpf_iter__task_file *ctx)
> +{
> + struct task_struct *task = ctx->task;
> + struct file *file = ctx->file;
> + struct socket *sock;
> + int *sock_tgid;
> +
> + if (!task || !file || task->tgid != task->pid)
> + return 0;
> +
> + sock = bpf_sock_from_file(file);
> + if (!sock)
> + return 0;
> +
> + sock_tgid = bpf_sk_storage_get(&sk_stg_map, sock->sk, 0,
> +BPF_SK_STORAGE_GET_F_CREATE);
Does it affect all sk(s) in the system?  Can it be limited to
the sk that the test is testing?
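
One possible way to narrow the effect, sketched as a variant of the quoted
program (same includes and sk_stg_map definition assumed; monitored_pid is
a made-up global the test would set before triggering the iterator):

__u32 monitored_pid = 0;

SEC("iter/task_file")
int fill_socket_owners(struct bpf_iter__task_file *ctx)
{
        struct task_struct *task = ctx->task;
        struct file *file = ctx->file;
        struct socket *sock;
        int *sock_tgid;

        if (!task || !file || task->tgid != task->pid)
                return 0;

        /* Only tag sockets owned by the pid chosen by the test. */
        if (monitored_pid && task->tgid != monitored_pid)
                return 0;

        sock = bpf_sock_from_file(file);
        if (!sock)
                return 0;

        sock_tgid = bpf_sk_storage_get(&sk_stg_map, sock->sk, 0,
                                       BPF_SK_STORAGE_GET_F_CREATE);
        if (!sock_tgid)
                return 0;

        *sock_tgid = task->tgid;
        return 0;
}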


Re: [PATCH v2 4/5] bpf: Add an iterator selftest for bpf_sk_storage_delete

2020-11-19 Thread Martin KaFai Lau
On Thu, Nov 19, 2020 at 05:26:53PM +0100, Florent Revest wrote:
> From: Florent Revest 
> 
> The eBPF program iterates over all entries (well, only one) of a socket
> local storage map and deletes them all. The test makes sure that the
> entry is indeed deleted.
Note that if there are many entries and seq->op->stop() is called (due to
seq_has_overflowed()), it is possible that not all of the entries will be
iterated (and deleted).  However, I think it is a more generic issue in
resuming the iteration and not specific to this series.

Acked-by: Martin KaFai Lau 


Re: [PATCH v2 3/5] bpf: Expose bpf_sk_storage_* to iterator programs

2020-11-19 Thread Martin KaFai Lau
On Thu, Nov 19, 2020 at 05:26:52PM +0100, Florent Revest wrote:
> From: Florent Revest 
> 
> Iterators are currently used to expose kernel information to userspace
> over fast procfs-like files but iterators could also be used to
> manipulate local storage. For example, the task_file iterator could be
> used to initialize a socket local storage with associations between
> processes and sockets or to selectively delete local storage values.
> 
> This exposes both socket local storage helpers to all iterators.
> Alternatively we could expose it to only certain iterators with strcmps
> on prog->aux->attach_func_name.
I cannot see any hole in iterating the bpf_sk_storage_map and also in
bpf_sk_storage_get/delete() itself for now.

I have looked at other iters (e.g. tcp, udp, and sock_map iter).
It will be good if you can double check them also.

I think at least one more test on the tcp iter is needed.

Other than that,

Acked-by: Martin KaFai Lau 


Re: [PATCH v2 2/5] bpf: Add a bpf_sock_from_file helper

2020-11-19 Thread Martin KaFai Lau
On Thu, Nov 19, 2020 at 05:26:51PM +0100, Florent Revest wrote:
> From: Florent Revest 
> 
> While eBPF programs can check whether a file is a socket by file->f_op
> == _file_ops, they cannot convert the void private_data pointer
> to a struct socket BTF pointer. In order to do this a new helper
> wrapping sock_from_file is added.
> 
> This is useful to tracing programs but also other program types
> inheriting this set of helpers such as iterators or LSM programs.
Acked-by: Martin KaFai Lau 


Re: [RFC PATCH bpf-next 0/8] Socket migration for SO_REUSEPORT.

2020-11-18 Thread Martin KaFai Lau
On Tue, Nov 17, 2020 at 06:40:15PM +0900, Kuniyuki Iwashima wrote:
> The SO_REUSEPORT option allows sockets to listen on the same port and to
> accept connections evenly. However, there is a defect in the current
> implementation. When a SYN packet is received, the connection is tied to a
> listening socket. Accordingly, when the listener is closed, in-flight
> requests during the three-way handshake and child sockets in the accept
> queue are dropped even if other listeners could accept such connections.
> 
> This situation can happen when various server management tools restart
> server (such as nginx) processes. For instance, when we change nginx
> configurations and restart it, it spins up new workers that respect the new
> configuration and closes all listeners on the old workers, resulting in
> in-flight ACK of 3WHS is responded by RST.
> 
> As a workaround for this issue, we can do connection draining by eBPF:
> 
>   1. Before closing a listener, stop routing SYN packets to it.
>   2. Wait enough time for requests to complete 3WHS.
>   3. Accept connections until EAGAIN, then close the listener.
> 
> Although this approach seems to work well, EAGAIN has nothing to do with
> how many requests are still during 3WHS. Thus, we have to know the number
It sounds like the application can already drain the established socket
by accept()?  To solve the problem that you have,
does it mean migrating req_sk (the in-progress 3WHS) is enough?

Applications can already use the bpf prog to do (1) and divert
the SYN to the newly started process.

If the application cares about service disruption,
it usually needs to drain the fd(s) that it already has and
finishes serving the pending request (e.g. https) on them anyway.
The time taking to finish those could already be longer than it takes
to drain the accept queue or finish off the 3WHS in reasonable time.
or the application that you have does not need to drain the fd(s) 
it already has and it can close them immediately?

> of such requests by counting SYN packets by eBPF to complete connection
> draining.
> 
>   1. Start counting SYN packets and accept syscalls using eBPF map.
>   2. Stop routing SYN packets.
>   3. Accept connections up to the count, then close the listener.
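
A rough sketch of the SYN-counting half of this workaround, as an XDP
program bumping a map counter for IPv4 SYNs (map and program names are
made up; the accept() counting would live in user space):

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, __u64);
} syn_count SEC(".maps");

SEC("xdp")
int count_syn(struct xdp_md *ctx)
{
        void *data = (void *)(long)ctx->data;
        void *data_end = (void *)(long)ctx->data_end;
        struct ethhdr *eth = data;
        struct iphdr *iph;
        struct tcphdr *tcph;
        __u32 key = 0;
        __u64 *cnt;

        if ((void *)(eth + 1) > data_end)
                return XDP_PASS;
        if (eth->h_proto != bpf_htons(ETH_P_IP))
                return XDP_PASS;

        iph = (void *)(eth + 1);
        if ((void *)(iph + 1) > data_end || iph->protocol != IPPROTO_TCP)
                return XDP_PASS;

        tcph = (void *)iph + iph->ihl * 4;
        if ((void *)(tcph + 1) > data_end)
                return XDP_PASS;

        /* Count pure SYNs, i.e. new connection attempts. */
        if (tcph->syn && !tcph->ack) {
                cnt = bpf_map_lookup_elem(&syn_count, &key);
                if (cnt)
                        __sync_fetch_and_add(cnt, 1);
        }
        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";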


Re: [RFC PATCH bpf-next 7/8] bpf: Call bpf_run_sk_reuseport() for socket migration.

2020-11-18 Thread Martin KaFai Lau
On Tue, Nov 17, 2020 at 06:40:22PM +0900, Kuniyuki Iwashima wrote:
> This patch makes it possible to select a new listener for socket migration
> by eBPF.
> 
> The noteworthy point is that we select a listening socket in
> reuseport_detach_sock() and reuseport_select_sock(), but we do not have
> struct skb in the unhash path.
> 
> Since we cannot pass skb to the eBPF program, we run only the
> BPF_PROG_TYPE_SK_REUSEPORT program by calling bpf_run_sk_reuseport() with
> skb NULL. So, some fields derived from skb are also NULL in the eBPF
> program.
More things need to be considered here when skb is NULL.

Some helpers are probably assuming skb is not NULL.

Also, the sk_lookup in filter.c is actually passing a NULL skb to avoid
doing the reuseport select.

> 
> Moreover, we can cancel migration by returning SK_DROP. This feature is
> useful when listeners have different settings at the socket API level or
> when we want to free resources as soon as possible.
> 
> Reviewed-by: Benjamin Herrenschmidt 
> Signed-off-by: Kuniyuki Iwashima 
> ---
>  net/core/filter.c  | 26 +-
>  net/core/sock_reuseport.c  | 23 ---
>  net/ipv4/inet_hashtables.c |  2 +-
>  3 files changed, 42 insertions(+), 9 deletions(-)
> 
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 01e28f283962..ffc4591878b8 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -8914,6 +8914,22 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type 
> type,
>   SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \
>BPF_FIELD_SIZEOF(NS, NF), 0)
>  
> +#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF_OR_NULL(S, NS, F, NF, SIZE, 
> OFF)\
> + do {
> \
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,  
> \
> +   si->src_reg, offsetof(S, F)); 
> \
> + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);  
> \
Although it may not matter much, always doing this check seems not very ideal
considering the fast path will always have an skb and only the slow
path (accept-queue migrate) has a NULL skb.  I think the req_sk usually
has the skb also, except the timer one.

First thought is to create a temp skb but it has its own issues.
or it may actually belong to a new prog type.  However, lets keep
exploring possible options (including NULL skb).

> + *insn++ = BPF_LDX_MEM(  
> \
> + SIZE, si->dst_reg, si->dst_reg, 
> \
> + bpf_target_off(NS, NF, sizeof_field(NS, NF),
> \
> +target_size) 
> \
> + + OFF); 
> \
> + } while (0)


Re: [RFC PATCH bpf-next 6/8] bpf: Add cookie in sk_reuseport_md.

2020-11-18 Thread Martin KaFai Lau
On Tue, Nov 17, 2020 at 06:40:21PM +0900, Kuniyuki Iwashima wrote:
> We will call sock_reuseport.prog for socket migration in the next commit,
> so the eBPF program has to know which listener is closing in order to
> select the new listener.
> 
> Currently, we can get a unique ID for each listener in the userspace by
> calling bpf_map_lookup_elem() for BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.
> This patch exposes the ID to the eBPF program.
> 
> Reviewed-by: Benjamin Herrenschmidt 
> Signed-off-by: Kuniyuki Iwashima 
> ---
>  include/linux/bpf.h| 1 +
>  include/uapi/linux/bpf.h   | 1 +
>  net/core/filter.c  | 8 
>  tools/include/uapi/linux/bpf.h | 1 +
>  4 files changed, 11 insertions(+)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 581b2a2e78eb..c0646eceffa2 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1897,6 +1897,7 @@ struct sk_reuseport_kern {
>   u32 hash;
>   u32 reuseport_id;
>   bool bind_inany;
> + u64 cookie;
>  };
>  bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type 
> type,
> struct bpf_insn_access_aux *info);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 162999b12790..3fcddb032838 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -4403,6 +4403,7 @@ struct sk_reuseport_md {
>   __u32 ip_protocol;  /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
>   __u32 bind_inany;   /* Is sock bound to an INANY address? */
>   __u32 hash; /* A hash of the packet 4 tuples */
> + __u64 cookie;   /* ID of the listener in map */
Instead of only adding the cookie of a sk, let's make the sk pointer available:

__bpf_md_ptr(struct bpf_sock *, sk);

and then use the BPF_FUNC_get_socket_cookie to get the cookie.

Other fields of the sk can also be directly accessed too once
the sk pointer is available.


Re: [RFC PATCH bpf-next 3/8] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-11-18 Thread Martin KaFai Lau
On Tue, Nov 17, 2020 at 06:40:18PM +0900, Kuniyuki Iwashima wrote:
> This patch lets reuseport_detach_sock() return a pointer of struct sock,
> which is used only by inet_unhash(). If it is not NULL,
> inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> sockets from the closing listener to the selected one.
> 
> Listening sockets hold incoming connections as a linked list of struct
> request_sock in the accept queue, and each request has reference to a full
> socket and its listener. In inet_csk_reqsk_queue_migrate(), we unlink the
> requests from the closing listener's queue and relink them to the head of
> the new listener's queue. We do not process each request, so the migration
> completes in O(1) time complexity. However, in the case of TCP_SYN_RECV
> sockets, we will take special care in the next commit.
> 
> By default, we select the last element of socks[] as the new listener.
> This behaviour is based on how the kernel moves sockets in socks[].
> 
> For example, we call listen() for four sockets (A, B, C, D), and close the
> first two by turns. The sockets move in socks[] like below. (See also [1])
> 
>   socks[0] : A <-.  socks[0] : D  socks[0] : D
>   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
>   socks[2] : C   |  socks[2] : C --'
>   socks[3] : D --'
> 
> Then, if C and D have newer settings than A and B, and each socket has a
> request (a, b, c, d) in their accept queue, we can redistribute old
> requests evenly to new listeners.
I don't think it should emphasize/claim there is a specific way that
the kernel-pick here can redistribute the requests evenly.  It depends on
how the application close/listen.  The userspace can not expect the
ordering of socks[] will behave in a certain way.

The primary redistribution policy has to depend on BPF which is the
policy defined by the user based on its application logic (e.g. how
its binary restart work).  The application (and bpf) knows which one
is a dying process and can avoid distributing to it.

The kernel-pick could be an optional fallback but not a must.  If the bpf
prog is attached, I would even go further to call bpf to redistribute
regardless of the sysctl, so I think the sysctl is not necessary.

> 
>   socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : D (a + d)
>   socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b + c)
>   socks[2] : C (c)   |  socks[2] : C (c) --'
>   socks[3] : D (d) --'
> 


Re: [PATCH bpf-next v3 2/2] bpf: Add tests for bpf_lsm_set_bprm_opts

2020-11-16 Thread Martin KaFai Lau
On Tue, Nov 17, 2020 at 02:13:07AM +, KP Singh wrote:
> From: KP Singh 
> 
> The test forks a child process, updates the local storage to set/unset
> the securexec bit.
> 
> The BPF program in the test attaches to bprm_creds_for_exec which checks
> the local storage of the current task to set the secureexec bit on the
> binary parameters (bprm).
> 
> The child then execs a bash command with the environment variable
> TMPDIR set in the envp.  The bash command returns a different exit code
> based on its observed value of the TMPDIR variable.
> 
> Since TMPDIR is one of the variables that is ignored by the dynamic
> loader when the secureexec bit is set, one should expect the
> child execution to not see this value when the secureexec bit is set.
> 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v3 1/2] bpf: Add bpf_lsm_set_bprm_opts helper

2020-11-16 Thread Martin KaFai Lau
On Tue, Nov 17, 2020 at 02:13:06AM +, KP Singh wrote:
> From: KP Singh 
> 
> The helper allows modification of certain bits on the linux_binprm
> struct starting with the secureexec bit which can be updated using the
> BPF_LSM_F_BPRM_SECUREEXEC flag.
> 
> secureexec can be set by the LSM for privilege gaining executions to set
> the AT_SECURE auxv for glibc.  When set, the dynamic linker disables the
> use of certain environment variables (like LD_PRELOAD).
> 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v2 2/2] bpf: Add tests for bpf_lsm_set_bprm_opts

2020-11-16 Thread Martin KaFai Lau
On Mon, Nov 16, 2020 at 11:25:36PM +, KP Singh wrote:
> From: KP Singh 
> 
> The test forks a child process, updates the local storage to set/unset
> the securexec bit.
> 
> The BPF program in the test attaches to bprm_creds_for_exec which checks
> the local storage of the current task to set the secureexec bit on the
> binary parameters (bprm).
> 
> The child then execs a bash command with the environment variable
> TMPDIR set in the envp.  The bash command returns a different exit code
> based on its observed value of the TMPDIR variable.
> 
> Since TMPDIR is one of the variables that is ignored by the dynamic
> loader when the secureexec bit is set, one should expect the
> child execution to not see this value when the secureexec bit is set.
> 
> Signed-off-by: KP Singh 
> ---
>  .../selftests/bpf/prog_tests/test_bprm_opts.c | 124 ++
>  tools/testing/selftests/bpf/progs/bprm_opts.c |  34 +
>  2 files changed, 158 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bprm_opts.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c 
> b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c
> new file mode 100644
> index ..cba1ef3dc8b4
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c
> @@ -0,0 +1,124 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright (C) 2020 Google LLC.
> + */
> +
> +#include 
> +#include 
Is it needed?

> +#include 
> +#include 
> +
> +#include "bprm_opts.skel.h"
> +#include "network_helpers.h"
> +
> +#ifndef __NR_pidfd_open
> +#define __NR_pidfd_open 434
> +#endif
> +
> +static const char * const bash_envp[] = { "TMPDIR=shouldnotbeset", NULL };
> +
> +static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
> +{
> + return syscall(__NR_pidfd_open, pid, flags);
> +}
> +
> +static int update_storage(int map_fd, int secureexec)
> +{
> + int task_fd, ret = 0;
> +
> + task_fd = sys_pidfd_open(getpid(), 0);
> + if (task_fd < 0)
> + return errno;
> +
> + ret = bpf_map_update_elem(map_fd, &task_fd, &secureexec, BPF_NOEXIST);
> + if (ret)
> + ret = errno;
> +
> + close(task_fd);
> + return ret;
> +}
> +
> +static int run_set_secureexec(int map_fd, int secureexec)
> +{
> +
> + int child_pid, child_status, ret, null_fd;
> +
> + child_pid = fork();
> + if (child_pid == 0) {
> + null_fd = open("/dev/null", O_WRONLY);
> + if (null_fd == -1)
> + exit(errno);
> + dup2(null_fd, STDOUT_FILENO);
> + dup2(null_fd, STDERR_FILENO);
> + close(null_fd);
> +
> + /* Ensure that all executions from hereon are
> +  * secure by setting a local storage which is read by
> +  * the bprm_creds_for_exec hook and sets bprm->secureexec.
> +  */
> + ret = update_storage(map_fd, secureexec);
> + if (ret)
> + exit(ret);
> +
> + /* If the binary is executed with securexec=1, the dynamic
> +  * loader ingores and unsets certain variables like LD_PRELOAD,
> +  * TMPDIR etc. TMPDIR is used here to simplify the example, as
> +  * LD_PRELOAD requires a real .so file.
> +  *
> +  * If the value of TMPDIR is set, the bash command returns 10
> +  * and if the value is unset, it returns 20.
> +  */
> + ret = execle("/bin/bash", "bash", "-c",
> +  "[[ -z \"${TMPDIR}\" ]] || exit 10 && exit 20",
> +  NULL, bash_envp);
> + if (ret)
It should never reach here?  Maybe just exit() unconditionally
instead of having a chance to fall through and then return -EINVAL.

> + exit(errno);
> + } else if (child_pid > 0) {
> + waitpid(child_pid, &child_status, 0);
> + ret = WEXITSTATUS(child_status);
> +
> + /* If a secureexec occured, the exit status should be 20.
> +  */
> + if (secureexec && ret == 20)
> + return 0;
> +
> + /* If normal execution happened the exit code should be 10.
> +  */
> + if (!secureexec && ret == 10)
> + return 0;
> +
> + return ret;
Any chance that ret may be 0?

> + }
> +
> + return -EINVAL;
> +}
> +
> +void test_test_bprm_opts(void)
> +{
> + int err, duration = 0;
> + struct bprm_opts *skel = NULL;
> +
> + skel = bprm_opts__open_and_load();
> + if (CHECK(!skel, "skel_load", "skeleton failed\n"))
> + goto close_prog;
> +
> + err = bprm_opts__attach(skel);
> + if (CHECK(err, "attach", "attach failed: %d\n", err))
> + goto close_prog;
> +
> + /* Run the test with the secureexec bit unset */
> + err = 

Re: [PATCH bpf-next v2 1/2] bpf: Add bpf_lsm_set_bprm_opts helper

2020-11-16 Thread Martin KaFai Lau
On Mon, Nov 16, 2020 at 11:25:35PM +, KP Singh wrote:
> From: KP Singh 
> 
> The helper allows modification of certain bits on the linux_binprm
> struct starting with the secureexec bit which can be updated using the
> BPF_LSM_F_BPRM_SECUREEXEC flag.
> 
> secureexec can be set by the LSM for privilege gaining executions to set
> the AT_SECURE auxv for glibc.  When set, the dynamic linker disables the
> use of certain environment variables (like LD_PRELOAD).
> 
> Signed-off-by: KP Singh 
> ---
>  include/uapi/linux/bpf.h   | 14 ++
>  kernel/bpf/bpf_lsm.c   | 27 +++
>  scripts/bpf_helpers_doc.py |  2 ++
>  tools/include/uapi/linux/bpf.h | 14 ++
>  4 files changed, 57 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 162999b12790..7f1b6ba8246c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3787,6 +3787,14 @@ union bpf_attr {
>   *   *ARG_PTR_TO_BTF_ID* of type *task_struct*.
>   *   Return
>   *   Pointer to the current task.
> + *
> + * long bpf_lsm_set_bprm_opts(struct linux_binprm *bprm, u64 flags)
> + *
> + *   Description
> + *   Sets certain options on the *bprm*:
> + *
> + *   **BPF_LSM_F_BPRM_SECUREEXEC** Set the secureexec bit
> + *   which sets the **AT_SECURE** auxv for glibc.
The return value needs to be documented also.

>   */
>  #define __BPF_FUNC_MAPPER(FN)\
>   FN(unspec), \
> @@ -3948,6 +3956,7 @@ union bpf_attr {
>   FN(task_storage_get),   \
>   FN(task_storage_delete),\
>   FN(get_current_task_btf),   \
> + FN(lsm_set_bprm_opts),  \
>   /* */
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> @@ -4119,6 +4128,11 @@ enum bpf_lwt_encap_mode {
>   BPF_LWT_ENCAP_IP,
>  };
>  
> +/* Flags for LSM helpers */
> +enum {
> + BPF_LSM_F_BPRM_SECUREEXEC   = (1ULL << 0),
> +};
> +
>  #define __bpf_md_ptr(type, name) \
>  union {  \
>   type name;  \
> diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> index 553107f4706a..31f85474a0ef 100644
> --- a/kernel/bpf/bpf_lsm.c
> +++ b/kernel/bpf/bpf_lsm.c
> @@ -7,6 +7,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -51,6 +52,30 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
>   return 0;
>  }
>  
> +/* Mask for all the currently supported BPRM option flags */
> +#define BPF_LSM_F_BRPM_OPTS_MASK 0x1ULL
If there is a need to have v3, it will be better to use
BPF_LSM_F_BPRM_SECUREEXEC instead of 0x1ULL.

> +
> +BPF_CALL_2(bpf_lsm_set_bprm_opts, struct linux_binprm *, bprm, u64, flags)
> +{
> +
> + if (flags & ~BPF_LSM_F_BRPM_OPTS_MASK)
> + return -EINVAL;
> +
> + bprm->secureexec = (flags & BPF_LSM_F_BPRM_SECUREEXEC);
The intention of this helper is to set "or clear" a bit?
It may be useful to clarify the "clear" part in the doc also.

> + return 0;
> +}
> +
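
For illustration, a minimal sketch of calling the proposed helper from the
bprm hook (assuming vmlinux.h plus helper declarations generated from this
patch's UAPI change; helper and flag names are the ones proposed above):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("lsm/bprm_creds_for_exec")
int BPF_PROG(force_secureexec, struct linux_binprm *bprm)
{
        /* Mark this exec as secure: glibc will see AT_SECURE and ignore
         * LD_PRELOAD, TMPDIR, etc.
         */
        bpf_lsm_set_bprm_opts(bprm, BPF_LSM_F_BPRM_SECUREEXEC);
        return 0;
}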


Re: [FIX bpf,perf] bpf,perf: return EOPNOTSUPP for bpf handler on PERF_COUNT_SW_DUMMY

2020-11-16 Thread Martin KaFai Lau
On Mon, Nov 16, 2020 at 07:37:52PM +0100, Florian Lehner wrote:
> bpf handlers for perf events other than tracepoints, kprobes or uprobes
> are attached to the overflow_handler of the perf event.
> 
> Perf events of type software/dummy are placeholder events. So when
> attaching a bpf handle to an overflow_handler of such an event, the bpf
> handler will not be triggered.
> 
> This fix returns the error EOPNOTSUPP to indicate that attaching a bpf
> handler to a perf event of type software/dummy is not supported.
> 
> Signed-off-by: Florian Lehner 
It is missing a Fixes tag.


Re: [PATCH] bpf: Expose bpf_sk_storage_* to iterator programs

2020-11-12 Thread Martin KaFai Lau
On Thu, Nov 12, 2020 at 09:09:14PM +0100, Florent Revest wrote:
> From: Florent Revest 
> 
> Iterators are currently used to expose kernel information to userspace
> over fast procfs-like files but iterators could also be used to
> initialize local storage. For example, the task_file iterator could be
> used to store associations between processes and sockets.
> 
> This exposes the socket local storage helpers to all iterators. Martin
> Kafai checked that this was safe to call these helpers from the
> sk_storage_map iterators.
> 
> Signed-off-by: Florent Revest 
> ---
>  kernel/trace/bpf_trace.c | 10 ++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index e4515b0f62a8..3530120fa280 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -17,6 +17,8 @@
>  #include 
>  #include 
>  
> +#include 
> +
>  #include 
>  #include 
>  
> @@ -1750,6 +1752,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, 
> const struct bpf_prog *prog)
>  NULL;
>   case BPF_FUNC_d_path:
>   return &bpf_d_path_proto;
> + case BPF_FUNC_sk_storage_get:
> + return prog->expected_attach_type == BPF_TRACE_ITER ?
> +&bpf_sk_storage_get_proto :
> +NULL;
> + case BPF_FUNC_sk_storage_delete:
> + return prog->expected_attach_type == BPF_TRACE_ITER ?
> +&bpf_sk_storage_delete_proto :
> +NULL;
Test(s) is needed.  e.g. iterating a bpf_sk_storage_map and also
calling bpf_sk_storage_get/delete.

I would expect to see another test/example
showing how it works end-to-end to solve the problem you have in hand.
This patch probably belongs to a longer series.

BTW, I am also enabling bpf_sk_storage_(get|delete) for FENTRY/FEXIT/RAW_TP
but I think the conflict should be manageable.
https://patchwork.ozlabs.org/project/netdev/patch/20201112211313.2587383-1-ka...@fb.com/
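
A sketch of the kind of test program being asked for here, an iterator that
walks a bpf_sk_storage_map and deletes every entry it visits (this assumes
the usual bpf_iter selftest includes and an sk_stg_map defined in the same
file):

SEC("iter/bpf_sk_storage_map")
int delete_bpf_sk_storage_map(struct bpf_iter__bpf_sk_storage_map *ctx)
{
        struct sock *sk = ctx->sk;

        if (sk)
                bpf_sk_storage_delete(&sk_stg_map, sk);

        return 0;
}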


Re: [PATCH v3 bpf-next 2/5] bpf: assign ID to vmlinux BTF and return extra info for BTF in GET_OBJ_INFO

2020-11-09 Thread Martin KaFai Lau
On Mon, Nov 09, 2020 at 01:00:21PM -0800, Andrii Nakryiko wrote:
> Allocate ID for vmlinux BTF. This makes it visible when iterating over all BTF
> objects in the system. To allow distinguishing vmlinux BTF (and later kernel
> module BTF) from user-provided BTFs, expose extra kernel_btf flag, as well as
> BTF name ("vmlinux" for vmlinux BTF, will equal to module's name for module
> BTF).  We might want to later allow specifying BTF name for user-provided BTFs
> as well, if that makes sense. But currently this is reserved only for
> in-kernel BTFs.
> 
> Having in-kernel BTFs exposed IDs will allow to extend BPF APIs that require
> in-kernel BTF type with ability to specify BTF types from kernel modules, not
> just vmlinux BTF. This will be implemented in a follow up patch set for
> fentry/fexit/fmod_ret/lsm/etc.
> 
> Signed-off-by: Andrii Nakryiko 
> ---
>  include/uapi/linux/bpf.h   |  3 +++
>  kernel/bpf/btf.c   | 39 --
>  tools/include/uapi/linux/bpf.h |  3 +++
>  3 files changed, 43 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 9879d6793e90..162999b12790 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -4466,6 +4466,9 @@ struct bpf_btf_info {
>   __aligned_u64 btf;
>   __u32 btf_size;
>   __u32 id;
> + __aligned_u64 name;
> + __u32 name_len;
> + __u32 kernel_btf;
>  } __attribute__((aligned(8)));
>  
>  struct bpf_link_info {
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 894ee33f4c84..663c3fb4e614 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -215,6 +215,8 @@ struct btf {
>   struct btf *base_btf;
>   u32 start_id; /* first type ID in this BTF (0 for base BTF) */
>   u32 start_str_off; /* first string offset (0 for base BTF) */
> + char name[MODULE_NAME_LEN];
> + bool kernel_btf;
>  };
>  
>  enum verifier_phase {
> @@ -4430,6 +4432,8 @@ struct btf *btf_parse_vmlinux(void)
>  
>   btf->data = __start_BTF;
>   btf->data_size = __stop_BTF - __start_BTF;
> + btf->kernel_btf = true;
> + snprintf(btf->name, sizeof(btf->name), "vmlinux");
>  
>   err = btf_parse_hdr(env);
>   if (err)
> @@ -4455,8 +4459,13 @@ struct btf *btf_parse_vmlinux(void)
>  
>   bpf_struct_ops_init(btf, log);
>  
> - btf_verifier_env_free(env);
>   refcount_set(&btf->refcnt, 1);
> +
> + err = btf_alloc_id(btf);
> + if (err)
> + goto errout;
> +
> + btf_verifier_env_free(env);
>   return btf;
>  
>  errout:
> @@ -5554,7 +5563,8 @@ int btf_get_info_by_fd(const struct btf *btf,
>   struct bpf_btf_info info;
>   u32 info_copy, btf_copy;
>   void __user *ubtf;
> - u32 uinfo_len;
> + char __user *uname;
> + u32 uinfo_len, uname_len, name_len;
>  
>   uinfo = u64_to_user_ptr(attr->info.info);
>   uinfo_len = attr->info.info_len;
> @@ -5571,6 +5581,31 @@ int btf_get_info_by_fd(const struct btf *btf,
>   return -EFAULT;
>   info.btf_size = btf->data_size;
>  
> + info.kernel_btf = btf->kernel_btf;
> +
> + uname = u64_to_user_ptr(info.name);
> + uname_len = info.name_len;
> + if (!uname ^ !uname_len)
> + return -EINVAL;
> +
> + name_len = strlen(btf->name);
> + info.name_len = name_len;
> +
> + if (uname) {
> + if (uname_len >= name_len + 1) {
> + if (copy_to_user(uname, btf->name, name_len + 1))
> + return -EFAULT;
> + } else {
> + char zero = '\0';
> +
> + if (copy_to_user(uname, btf->name, uname_len - 1))
> + return -EFAULT;
> + if (put_user(zero, uname + uname_len - 1))
> + return -EFAULT;
> + return -ENOSPC;
It should still do copy_to_user() even if it will return -ENOSPC.

> + }
> + }
> +
>   if (copy_to_user(uinfo, &info, info_copy) ||
>   put_user(info_copy, &uattr->info.info_len))
>   return -EFAULT;
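
In other words, something along these lines (a sketch only, reusing the
variable names from the quoted patch): record -ENOSPC in a local ret, still
NUL-terminate the truncated name, and only return after the final info copy
has been done:

        int ret = 0;

        /* ... uname/uname_len/name_len computed as in the patch ... */

        if (uname) {
                if (uname_len >= name_len + 1) {
                        if (copy_to_user(uname, btf->name, name_len + 1))
                                return -EFAULT;
                } else {
                        char zero = '\0';

                        if (copy_to_user(uname, btf->name, uname_len - 1))
                                return -EFAULT;
                        if (put_user(zero, uname + uname_len - 1))
                                return -EFAULT;
                        /* Report truncation, but keep going. */
                        ret = -ENOSPC;
                }
        }

        if (copy_to_user(uinfo, &info, info_copy) ||
            put_user(info_copy, &uattr->info.info_len))
                return -EFAULT;

        return ret;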


Re: [PATCH bpf-next v5 5/9] bpf: Implement get_current_task_btf and RET_PTR_TO_BTF_ID

2020-11-05 Thread Martin KaFai Lau
On Thu, Nov 05, 2020 at 10:58:23PM +, KP Singh wrote:
> From: KP Singh 
> 
> The currently available bpf_get_current_task returns an unsigned integer
> which can be used along with BPF_CORE_READ to read data from
> the task_struct but still cannot be used as an input argument to a
> helper that accepts an ARG_PTR_TO_BTF_ID of type task_struct.
> 
> In order to implement this helper a new return type, RET_PTR_TO_BTF_ID,
> is added. This is similar to RET_PTR_TO_BTF_ID_OR_NULL but does not
> require checking the nullness of returned pointer.
> 
> Acked-by: Song Liu 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v5 8/9] bpf: Add tests for task_local_storage

2020-11-05 Thread Martin KaFai Lau
On Thu, Nov 05, 2020 at 10:58:26PM +, KP Singh wrote:
> From: KP Singh 
> 
> The test exercises the syscall based map operations by creating a pidfd
> for the current process.
> 
> For verifying kernel / LSM functionality, the test implements a simple
> MAC policy which denies an executable from unlinking itself. The LSM
> program bprm_committed_creds sets a task_local_storage with a pointer to
> the inode. This is then used to detect if the task is trying to unlink
> itself in the inode_unlink LSM hook.
> 
> The test copies /bin/rm to /tmp and executes it in a child thread with
> the intention of deleting itself. A successful test should prevent the
> the running executable from deleting itself.
> 
> The bpf programs are also updated to call bpf_spin_{lock, unlock} to
> trigger the verfier checks for spin locks.
> 
> The temporary file is cleaned up later in the test.
> 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v4 9/9] bpf: Exercise syscall operations for inode and sk storage

2020-11-05 Thread Martin KaFai Lau
On Thu, Nov 05, 2020 at 03:47:55PM +0100, KP Singh wrote:
> From: KP Singh 
> 
> Use the check_syscall_operations added for task_local_storage to
> exercise syscall operations for other local storage maps:
> 
> * Check the absence of an element for the given fd.
> * Create a new element, retrieve and compare its value.
> * Delete the element and check again for absence.
> 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v4 8/9] bpf: Add tests for task_local_storage

2020-11-05 Thread Martin KaFai Lau
On Thu, Nov 05, 2020 at 03:47:54PM +0100, KP Singh wrote:
> From: KP Singh 
> 
> The test exercises the syscall based map operations by creating a pidfd
> for the current process.
> 
> For verifying kernel / LSM functionality, the test implements a simple
> MAC policy which denies an executable from unlinking itself. The LSM
> program bprm_committed_creds sets a task_local_storage with a pointer to
> the inode. This is then used to detect if the task is trying to unlink
> itself in the inode_unlink LSM hook.
> 
> The test copies /bin/rm to /tmp and executes it in a child thread with
> the intention of deleting itself. A successful test should prevent the
> the running executable from deleting itself.
> 
> The bpf programs are also updated to call bpf_spin_{lock, unlock} to
> trigger the verfier checks for spin locks.
> 
> The temporary file is cleaned up later in the test.
> 
> Signed-off-by: KP Singh 
> ---
>  .../bpf/prog_tests/test_local_storage.c   | 167 --
>  .../selftests/bpf/progs/local_storage.c   |  61 ++-
>  2 files changed, 210 insertions(+), 18 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c 
> b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
> index 91cd6f357246..feba23f8848b 100644
> --- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
> +++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
> @@ -4,30 +4,149 @@
>   * Copyright (C) 2020 Google LLC.
>   */
>  
> +#define _GNU_SOURCE
> +
> +#include 
> +#include 
> +#include 
>  #include 
>  #include 
>  
>  #include "local_storage.skel.h"
>  #include "network_helpers.h"
>  
> -int create_and_unlink_file(void)
> +static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
> +{
> + return syscall(__NR_pidfd_open, pid, flags);
> +}
> +
> +unsigned int duration;
static

> +
> +#define TEST_STORAGE_VALUE 0xbeefdead
> +
> +struct storage {
> + void *inode;
> + unsigned int value;
> + /* Lock ensures that spin locked versions of local stoage operations
> +  * also work, most operations in this tests are still single threaded
> +  */
> + struct bpf_spin_lock lock;
> +};
> +
> +/* Copies an rm binary to a temp file. dest is a mkstemp template */
> +int copy_rm(char *dest)
static

>  {
> - char fname[PATH_MAX] = "/tmp/fileXXXXXX";
> - int fd;
> + int ret, fd_in, fd_out;
> + struct stat stat;
>  
> - fd = mkstemp(fname);
> - if (fd < 0)
> - return fd;
> + fd_in = open("/bin/rm", O_RDONLY);
> + if (fd_in < 0)
> + return fd_in;
>  
> - close(fd);
> - unlink(fname);
> + fd_out = mkstemp(dest);
> + if (fd_out < 0)
> + return fd_out;
> +
> + ret = fstat(fd_in, &stat);
> + if (ret == -1)
> + return errno;
> +
> + ret = copy_file_range(fd_in, NULL, fd_out, NULL, stat.st_size, 0);
> + if (ret == -1)
> + return errno;
> +
> + /* Set executable permission on the copied file */
> + ret = chmod(dest, 0100);
> + if (ret == -1)
> + return errno;
> +
> + close(fd_in);
> + close(fd_out);
fd_in and fd_out are not closed in error cases.

>   return 0;
>  }
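
A sketch of the same copy_rm() with the error paths also closing the fds
(names as in the quoted test; untested):

static int copy_rm(char *dest)
{
        int ret, fd_in, fd_out = -1;
        struct stat stat;

        fd_in = open("/bin/rm", O_RDONLY);
        if (fd_in < 0)
                return errno;

        fd_out = mkstemp(dest);
        if (fd_out < 0) {
                ret = errno;
                goto out;
        }

        ret = fstat(fd_in, &stat);
        if (ret == -1) {
                ret = errno;
                goto out;
        }

        ret = copy_file_range(fd_in, NULL, fd_out, NULL, stat.st_size, 0);
        if (ret == -1) {
                ret = errno;
                goto out;
        }

        /* Set executable permission on the copied file. */
        ret = chmod(dest, 0100);
        if (ret == -1)
                ret = errno;
        else
                ret = 0;
out:
        close(fd_in);
        if (fd_out >= 0)
                close(fd_out);
        return ret;
}
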
>  
> +/* Fork and exec the provided rm binary and return the exit code of the
> + * forked process and its pid.
> + */
> +int run_self_unlink(int *monitored_pid, const char *rm_path)
static

[ ... ]

> +bool check_syscall_operations(int map_fd, int obj_fd)
static

[ ... ]

>  void test_test_local_storage(void)
>  {
> + char tmp_exec_path[PATH_MAX] = "/tmp/copy_of_rmXXXXXX";
> + int err, serv_sk = -1, task_fd = -1;
>   struct local_storage *skel = NULL;
> - int err, duration = 0, serv_sk = -1;
>  
>   skel = local_storage__open_and_load();
>   if (CHECK(!skel, "skel_load", "lsm skeleton failed\n"))
> @@ -37,10 +156,35 @@ void test_test_local_storage(void)
>   if (CHECK(err, "attach", "lsm attach failed: %d\n", err))
>   goto close_prog;
>  
> + task_fd = sys_pidfd_open(getpid(), 0);
> + if (CHECK(task_fd < 0, "pidfd_open",
> +   "failed to get pidfd err:%d, errno:%d", task_fd, errno))
> + goto close_prog;
> +
> + if (!check_syscall_operations(bpf_map__fd(skel->maps.task_storage_map),
> +   task_fd))
> + goto close_prog;
> +
> + err = copy_rm(tmp_exec_path);
> + if (CHECK(err < 0, "copy_rm", "err %d errno %d\n", err, errno))
> + goto close_prog;
> +
> + /* Sets skel->bss->monitored_pid to the pid of the forked child
> +  * forks a child process that executes tmp_exec_path and tries to
> +  * unlink its executable. This operation should be denied by the loaded
> +  * LSM program.
> +  */
> + err = run_self_unlink(&skel->bss->monitored_pid, tmp_exec_path);
> + if (CHECK(err != EPERM, "run_self_unlink", "err %d want EPERM\n", err))
> + goto close_prog;
> +
> +

Re: [PATCH bpf-next v4 5/9] bpf: Implement get_current_task_btf and RET_PTR_TO_BTF_ID

2020-11-05 Thread Martin KaFai Lau
On Thu, Nov 05, 2020 at 03:47:51PM +0100, KP Singh wrote:
> From: KP Singh 
> 
> The currently available bpf_get_current_task returns an unsigned integer
> which can be used along with BPF_CORE_READ to read data from
> the task_struct but still cannot be used as an input argument to a
> helper that accepts an ARG_PTR_TO_BTF_ID of type task_struct.
> 
> In order to implement this helper a new return type, RET_PTR_TO_BTF_ID,
> is added. This is similar to RET_PTR_TO_BTF_ID_OR_NULL but does not
> require checking the nullness of returned pointer.
> 
> Acked-by: Song Liu 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v4 4/9] bpftool: Add support for task local storage

2020-11-05 Thread Martin KaFai Lau
On Thu, Nov 05, 2020 at 03:47:50PM +0100, KP Singh wrote:
> From: KP Singh 
> 
> Updates the binary to handle the BPF_MAP_TYPE_TASK_STORAGE as
> "task_storage" for printing and parsing. Also updates the documentation
> and bash completion
> 
> Acked-by: Song Liu 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v4 2/9] bpf: Implement task local storage

2020-11-05 Thread Martin KaFai Lau
On Thu, Nov 05, 2020 at 03:47:48PM +0100, KP Singh wrote:
> From: KP Singh 
> 
> Similar to bpf_local_storage for sockets and inodes add local storage
> for task_struct.
> 
> The life-cycle of storage is managed with the life-cycle of the
> task_struct.  i.e. the storage is destroyed along with the owning task
> with a callback to the bpf_task_storage_free from the task_free LSM
> hook.
> 
> The BPF LSM allocates an __rcu pointer to the bpf_local_storage in
> the security blob which are now stackable and can co-exist with other
> LSMs.
> 
> The userspace map operations can be done by using a pid fd as a key
> passed to the lookup, update and delete operations.
> 
> Acked-by: Song Liu 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v4 1/9] bpf: Allow LSM programs to use bpf spin locks

2020-11-05 Thread Martin KaFai Lau
On Thu, Nov 05, 2020 at 03:47:47PM +0100, KP Singh wrote:
> From: KP Singh 
> 
> Usage of spin locks was not allowed for tracing programs due to
> insufficient preemption checks. The verifier does not currently prevent
> LSM programs from using spin locks, but the helpers are not exposed
> via bpf_lsm_func_proto.
> 
> Based on the discussion in [1], non-sleepable LSM programs should be
> able to use bpf_spin_{lock, unlock}.
> 
> Sleepable LSM programs can be preempted which means that allowng spin
> locks will need more work (disabling preemption and the verifier
> ensuring that no sleepable helpers are called when a spin lock is held).
> 
> [1]: 
> https://lore.kernel.org/bpf/20201103153132.2717326-1-kpsi...@chromium.org/T/#md601a053229287659071600d3483523f752cd2fb
> 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v3 5/9] bpf: Allow LSM programs to use bpf spin locks

2020-11-04 Thread Martin KaFai Lau
On Wed, Nov 04, 2020 at 05:44:49PM +0100, KP Singh wrote:
> From: KP Singh 
> 
> Usage of spin locks was not allowed for tracing programs due to
> insufficient preemption checks. The verifier does not currently prevent
> LSM programs from using spin locks, but the helpers are not exposed
> via bpf_lsm_func_proto.
This could be the first patch but don't feel strongly about it.

> 
> Based on the discussion in [1], non-sleepable LSM programs should be
> able to use bpf_spin_{lock, unlock}.
> 
> Sleepable LSM programs can be preempted which means that allowng spin
> locks will need more work (disabling preemption and the verifier
> ensuring that no sleepable helpers are called when a spin lock is held).
> 
> [1]: 
> https://lore.kernel.org/bpf/20201103153132.2717326-1-kpsi...@chromium.org/T/#md601a053229287659071600d3483523f752cd2fb
> 
> Signed-off-by: KP Singh 
> ---
>  kernel/bpf/bpf_lsm.c  |  4 
>  kernel/bpf/verifier.c | 17 +
>  2 files changed, 21 insertions(+)
> 
> diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> index 61f8cc52fd5b..93383df2140b 100644
> --- a/kernel/bpf/bpf_lsm.c
> +++ b/kernel/bpf/bpf_lsm.c
> @@ -63,6 +63,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct 
> bpf_prog *prog)
>   return &bpf_task_storage_get_proto;
>   case BPF_FUNC_task_storage_delete:
>   return &bpf_task_storage_delete_proto;
> + case BPF_FUNC_spin_lock:
> + return &bpf_spin_lock_proto;
> + case BPF_FUNC_spin_unlock:
> + return &bpf_spin_unlock_proto;
>   default:
>   return tracing_prog_func_proto(func_id, prog);
>   }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 314018e8fc12..7c6c246077cf 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -9739,6 +9739,23 @@ static int check_map_prog_compatibility(struct 
> bpf_verifier_env *env,
>   return -EINVAL;
>   }
>  
> + if (map_value_has_spin_lock(map)) {
> + if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
> + verbose(env, "socket filter progs cannot use 
> bpf_spin_lock yet\n");
> + return -EINVAL;
> + }
> +
> + if (is_tracing_prog_type(prog_type)) {
> + verbose(env, "tracing progs cannot use bpf_spin_lock 
> yet\n");
> + return -EINVAL;
> + }
It is good to have a more specific verifier log.  However,
these are duplicated checks (a few lines above in the same function).
They should at least be removed.

> +
> + if (prog->aux->sleepable) {
> + verbose(env, "sleepable progs cannot use bpf_spin_lock 
> yet\n");
> + return -EINVAL;
> + }
> + }
> +
>   if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
>   !bpf_offload_prog_map_match(prog, map)) {
>   verbose(env, "offload device mismatch between prog and map\n");
> -- 
> 2.29.1.341.ge80a0c044ae-goog
> 
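
To make the combination concrete, a minimal sketch of what patches 1 and 5
together allow, i.e. a non-sleepable LSM program taking a bpf_spin_lock
embedded in a task local storage value (vmlinux.h-style includes as in the
selftests of this series are assumed; names are illustrative):

struct storage {
        struct bpf_spin_lock lock;
        __u32 value;
};

struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct storage);
} task_map SEC(".maps");

SEC("lsm/bprm_committed_creds")
int BPF_PROG(tag_exec, struct linux_binprm *bprm)
{
        struct storage *st;

        st = bpf_task_storage_get(&task_map, bpf_get_current_task_btf(), 0,
                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (!st)
                return 0;

        bpf_spin_lock(&st->lock);
        st->value++;
        bpf_spin_unlock(&st->lock);
        return 0;
}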


Re: [PATCH bpf-next v3 2/9] libbpf: Add support for task local storage

2020-11-04 Thread Martin KaFai Lau
On Wed, Nov 04, 2020 at 05:44:46PM +0100, KP Singh wrote:
> From: KP Singh 
> 
> Updates the bpf_probe_map_type API to also support
> BPF_MAP_TYPE_TASK_STORAGE similar to other local storage maps.
> 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v3 1/9] bpf: Implement task local storage

2020-11-04 Thread Martin KaFai Lau
Please ignore this reply which has missed some recipients.

On Wed, Nov 04, 2020 at 02:08:14PM -0800, Martin KaFai Lau wrote:
> On Wed, Nov 04, 2020 at 05:44:45PM +0100, KP Singh wrote:
> [ ... ]
> 
> > +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void 
> > *key)
> > +{
> > +   struct bpf_local_storage_data *sdata;
> > +   struct task_struct *task;
> > +   unsigned int f_flags;
> > +   struct pid *pid;
> > +   int fd, err;
> > +
> > +   fd = *(int *)key;
> > +   pid = pidfd_get_pid(fd, &f_flags);
> > +   if (IS_ERR(pid))
> > +   return ERR_CAST(pid);
> > +
> > +   /* We should be in an RCU read side critical section, it should be safe
> > +* to call pid_task.
> > +*/
> > +   WARN_ON_ONCE(!rcu_read_lock_held());
> > +   task = pid_task(pid, PIDTYPE_PID);
> > +   if (!task) {
> > +   err = -ENOENT;
> > +   goto out;
> > +   }
> > +
> > +   sdata = task_storage_lookup(task, map, true);
> > +   put_pid(pid);
> > +   return sdata ? sdata->data : NULL;
> > +out:
> > +   put_pid(pid);
> > +   return ERR_PTR(err);
> > +}
> > +
> > +static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
> > +   void *value, u64 map_flags)
> > +{
> > +   struct bpf_local_storage_data *sdata;
> > +   struct task_struct *task;
> > +   unsigned int f_flags;
> > +   struct pid *pid;
> > +   int fd, err;
> > +
> > +   fd = *(int *)key;
> > +   pid = pidfd_get_pid(fd, &f_flags);
> > +   if (IS_ERR(pid))
> > +   return PTR_ERR(pid);
> > +
> > +   /* We should be in an RCU read side critical section, it should be safe
> > +* to call pid_task.
> > +*/
> > +   WARN_ON_ONCE(!rcu_read_lock_held());
> > +   task = pid_task(pid, PIDTYPE_PID);
> > +   if (!task) {
> > +   err = -ENOENT;
> > +   goto out;
> > +   }
> > +
> > +   sdata = bpf_local_storage_update(
> > +   task, (struct bpf_local_storage_map *)map, value, map_flags);
> It seems the task is protected by rcu here and the task may be going away.
> Is it ok?
> 
> or the following comment in the later "BPF_CALL_4(bpf_task_storage_get, ...)"
> is no longer valid?
>   /* This helper must only called from where the task is guaranteed
>* to have a refcount and cannot be freed.
>*/
> 
> > +
> > +   err = PTR_ERR_OR_ZERO(sdata);
> > +out:
> > +   put_pid(pid);
> > +   return err;
> > +}
> > +
> 
> [ ... ]
> 
> > +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct 
> > *,
> > +  task, void *, value, u64, flags)
> > +{
> > +   struct bpf_local_storage_data *sdata;
> > +
> > +   if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
> > +   return (unsigned long)NULL;
> > +
> > +   /* explicitly check that the task_storage_ptr is not
> > +* NULL as task_storage_lookup returns NULL in this case and
> > +* bpf_local_storage_update expects the owner to have a
> > +* valid storage pointer.
> > +*/
> > +   if (!task_storage_ptr(task))
> > +   return (unsigned long)NULL;
> > +
> > +   sdata = task_storage_lookup(task, map, true);
> > +   if (sdata)
> > +   return (unsigned long)sdata->data;
> > +
> > +   /* This helper must only called from where the task is guaranteed
> > +* to have a refcount and cannot be freed.
> > +*/
> > +   if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
> > +   sdata = bpf_local_storage_update(
> > +   task, (struct bpf_local_storage_map *)map, value,
> > +   BPF_NOEXIST);
> > +   return IS_ERR(sdata) ? (unsigned long)NULL :
> > +(unsigned long)sdata->data;
> > +   }
> > +
> > +   return (unsigned long)NULL;
> > +}
> > +
> 
> [ ... ]
> 
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index 8f50c9c19f1b..f3fe9f53f93c 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -773,7 +773,8 @@ static int map_check_btf(struct bpf_map *map, const 
> > struct btf *btf,
> > map->map_type != BPF_MAP_TYPE_ARRAY &&
> > map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
> > map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
> > -   map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
> > +   map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
> > +   map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
> This is to enable spin lock support in a map's value.  Without peeking
> patch 5, I was confused a bit here.  It seems patch 5 was missed when
> inode storage was added.
> 
> > return -ENOTSUPP;
> > if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
> > map->value_size) {


Re: [PATCH bpf-next v3 1/9] bpf: Implement task local storage

2020-11-04 Thread Martin KaFai Lau
On Wed, Nov 04, 2020 at 05:44:45PM +0100, KP Singh wrote:
[ ... ]

> +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
> +{
> + struct bpf_local_storage_data *sdata;
> + struct task_struct *task;
> + unsigned int f_flags;
> + struct pid *pid;
> + int fd, err;
> +
> + fd = *(int *)key;
> + pid = pidfd_get_pid(fd, &f_flags);
> + if (IS_ERR(pid))
> + return ERR_CAST(pid);
> +
> + /* We should be in an RCU read side critical section, it should be safe
> +  * to call pid_task.
> +  */
> + WARN_ON_ONCE(!rcu_read_lock_held());
> + task = pid_task(pid, PIDTYPE_PID);
> + if (!task) {
> + err = -ENOENT;
> + goto out;
> + }
> +
> + sdata = task_storage_lookup(task, map, true);
> + put_pid(pid);
> + return sdata ? sdata->data : NULL;
> +out:
> + put_pid(pid);
> + return ERR_PTR(err);
> +}
> +
> +static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
> + void *value, u64 map_flags)
> +{
> + struct bpf_local_storage_data *sdata;
> + struct task_struct *task;
> + unsigned int f_flags;
> + struct pid *pid;
> + int fd, err;
> +
> + fd = *(int *)key;
> + pid = pidfd_get_pid(fd, &f_flags);
> + if (IS_ERR(pid))
> + return PTR_ERR(pid);
> +
> + /* We should be in an RCU read side critical section, it should be safe
> +  * to call pid_task.
> +  */
> + WARN_ON_ONCE(!rcu_read_lock_held());
> + task = pid_task(pid, PIDTYPE_PID);
> + if (!task) {
> + err = -ENOENT;
> + goto out;
> + }
> +
> + sdata = bpf_local_storage_update(
> + task, (struct bpf_local_storage_map *)map, value, map_flags);
It seems the task is protected by rcu here and the task may be going away.
Is it ok?

or the following comment in the later "BPF_CALL_4(bpf_task_storage_get, ...)"
is no longer valid?
/* This helper must only called from where the task is guaranteed
 * to have a refcount and cannot be freed.
 */
 
> +
> + err = PTR_ERR_OR_ZERO(sdata);
> +out:
> + put_pid(pid);
> + return err;
> +}
> +

[ ... ]

> +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> +task, void *, value, u64, flags)
> +{
> + struct bpf_local_storage_data *sdata;
> +
> + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
> + return (unsigned long)NULL;
> +
> + /* explicitly check that the task_storage_ptr is not
> +  * NULL as task_storage_lookup returns NULL in this case and
> +  * bpf_local_storage_update expects the owner to have a
> +  * valid storage pointer.
> +  */
> + if (!task_storage_ptr(task))
> + return (unsigned long)NULL;
> +
> + sdata = task_storage_lookup(task, map, true);
> + if (sdata)
> + return (unsigned long)sdata->data;
> +
> + /* This helper must only called from where the task is guaranteed
> +  * to have a refcount and cannot be freed.
> +  */
> + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
> + sdata = bpf_local_storage_update(
> + task, (struct bpf_local_storage_map *)map, value,
> + BPF_NOEXIST);
> + return IS_ERR(sdata) ? (unsigned long)NULL :
> +  (unsigned long)sdata->data;
> + }
> +
> + return (unsigned long)NULL;
> +}

[ ... ]

> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 8f50c9c19f1b..f3fe9f53f93c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -773,7 +773,8 @@ static int map_check_btf(struct bpf_map *map, const 
> struct btf *btf,
>   map->map_type != BPF_MAP_TYPE_ARRAY &&
>   map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
>   map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
> - map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
> + map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
> + map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
This is to enable spin lock support in a map's value.  Without peeking
patch 5, I was confused a bit here.  It seems patch 5 was missed when
inode storage was added.

>   return -ENOTSUPP;
>   if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
>   map->value_size) {
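
One possible shape for the lifetime question above, sketched with the
variable names from the quoted patch (untested): pin the task with a
reference across the update instead of relying on RCU alone:

        task = pid_task(pid, PIDTYPE_PID);
        if (!task) {
                err = -ENOENT;
                goto out;
        }
        get_task_struct(task);

        sdata = bpf_local_storage_update(
                task, (struct bpf_local_storage_map *)map, value, map_flags);
        err = PTR_ERR_OR_ZERO(sdata);

        put_task_struct(task);
out:
        put_pid(pid);
        return err;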


Re: [PATCH bpf-next v3 1/9] bpf: Implement task local storage

2020-11-04 Thread Martin KaFai Lau
On Wed, Nov 04, 2020 at 05:44:45PM +0100, KP Singh wrote:
[ ... ]

> +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
> +{
> + struct bpf_local_storage_data *sdata;
> + struct task_struct *task;
> + unsigned int f_flags;
> + struct pid *pid;
> + int fd, err;
> +
> + fd = *(int *)key;
> + pid = pidfd_get_pid(fd, _flags);
> + if (IS_ERR(pid))
> + return ERR_CAST(pid);
> +
> + /* We should be in an RCU read side critical section, it should be safe
> +  * to call pid_task.
> +  */
> + WARN_ON_ONCE(!rcu_read_lock_held());
> + task = pid_task(pid, PIDTYPE_PID);
> + if (!task) {
> + err = -ENOENT;
> + goto out;
> + }
> +
> + sdata = task_storage_lookup(task, map, true);
> + put_pid(pid);
> + return sdata ? sdata->data : NULL;
> +out:
> + put_pid(pid);
> + return ERR_PTR(err);
> +}
> +
> +static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
> + void *value, u64 map_flags)
> +{
> + struct bpf_local_storage_data *sdata;
> + struct task_struct *task;
> + unsigned int f_flags;
> + struct pid *pid;
> + int fd, err;
> +
> + fd = *(int *)key;
> + pid = pidfd_get_pid(fd, &f_flags);
> + if (IS_ERR(pid))
> + return PTR_ERR(pid);
> +
> + /* We should be in an RCU read side critical section, it should be safe
> +  * to call pid_task.
> +  */
> + WARN_ON_ONCE(!rcu_read_lock_held());
> + task = pid_task(pid, PIDTYPE_PID);
> + if (!task) {
> + err = -ENOENT;
> + goto out;
> + }
> +
> + sdata = bpf_local_storage_update(
> + task, (struct bpf_local_storage_map *)map, value, map_flags);
It seems the task is protected by rcu here and the task may be going away.
Is it ok?

or the following comment in the later "BPF_CALL_4(bpf_task_storage_get, ...)"
is no longer valid?
/* This helper must only called from where the task is guaranteed
 * to have a refcount and cannot be freed.
 */

> +
> + err = PTR_ERR_OR_ZERO(sdata);
> +out:
> + put_pid(pid);
> + return err;
> +}
> +
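
To make the concern above concrete, one way to pin the task for the duration of
the update would be along these lines (a sketch only, not the submitted patch;
it assumes get_pid_task()/put_task_struct() are acceptable in this syscall
path):

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task) {
		err = -ENOENT;
		goto out;
	}

	sdata = bpf_local_storage_update(task,
					 (struct bpf_local_storage_map *)map,
					 value, map_flags);
	err = PTR_ERR_OR_ZERO(sdata);
	put_task_struct(task);
out:
	put_pid(pid);
	return err;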

[ ... ]

> +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> +task, void *, value, u64, flags)
> +{
> + struct bpf_local_storage_data *sdata;
> +
> + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
> + return (unsigned long)NULL;
> +
> + /* explicitly check that the task_storage_ptr is not
> +  * NULL as task_storage_lookup returns NULL in this case and
> +  * bpf_local_storage_update expects the owner to have a
> +  * valid storage pointer.
> +  */
> + if (!task_storage_ptr(task))
> + return (unsigned long)NULL;
> +
> + sdata = task_storage_lookup(task, map, true);
> + if (sdata)
> + return (unsigned long)sdata->data;
> +
> + /* This helper must only called from where the task is guaranteed
> +  * to have a refcount and cannot be freed.
> +  */
> + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
> + sdata = bpf_local_storage_update(
> + task, (struct bpf_local_storage_map *)map, value,
> + BPF_NOEXIST);
> + return IS_ERR(sdata) ? (unsigned long)NULL :
> +  (unsigned long)sdata->data;
> + }
> +
> + return (unsigned long)NULL;
> +}
> +

[ ... ]

> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 8f50c9c19f1b..f3fe9f53f93c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -773,7 +773,8 @@ static int map_check_btf(struct bpf_map *map, const 
> struct btf *btf,
>   map->map_type != BPF_MAP_TYPE_ARRAY &&
>   map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
>   map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
> - map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
> + map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
> + map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
This is to enable spin lock support in a map's value.  Without peeking
patch 5, I was confused a bit here.  It seems patch 5 was missed when
inode storage was added.

>   return -ENOTSUPP;
>   if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
>   map->value_size) {


Re: [PATCH bpf-next 1/5] bpf: Implement task local storage

2020-10-28 Thread Martin KaFai Lau
On Tue, Oct 27, 2020 at 06:03:13PM +0100, KP Singh wrote:
[ ... ] 

> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e6ceac3f7d62..bb443c4f3637 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -157,6 +157,7 @@ enum bpf_map_type {
>   BPF_MAP_TYPE_STRUCT_OPS,
>   BPF_MAP_TYPE_RINGBUF,
>   BPF_MAP_TYPE_INODE_STORAGE,
> + BPF_MAP_TYPE_TASK_STORAGE,
>  };
>  
>  /* Note that tracing related programs such as
> @@ -3742,6 +3743,42 @@ union bpf_attr {
>   *   Return
>   *   The helper returns **TC_ACT_REDIRECT** on success or
>   *   **TC_ACT_SHOT** on error.
> + *
> + * void *bpf_task_storage_get(struct bpf_map *map, void *task, void *value, 
> u64 flags)
After peeking patch 2,  I think the pointer type should be
"struct task_struct *task" instead of "void *task".

Same for bpf_task_storage_delete().

> + *   Description
> + *   Get a bpf_local_storage from the *task*.
> + *
> + *   Logically, it could be thought of as getting the value from
> + *   a *map* with *task* as the **key**.  From this
> + *   perspective,  the usage is not much different from
> + *   **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this
> + *   helper enforces the key must be an task_struct and the map must 
> also
> + *   be a **BPF_MAP_TYPE_TASK_STORAGE**.
> + *
> + *   Underneath, the value is stored locally at *task* instead of
> + *   the *map*.  The *map* is used as the bpf-local-storage
> + *   "type". The bpf-local-storage "type" (i.e. the *map*) is
> + *   searched against all bpf_local_storage residing at *task*.
> + *
> + *   An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be
> + *   used such that a new bpf_local_storage will be
> + *   created if one does not exist.  *value* can be used
> + *   together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify
> + *   the initial value of a bpf_local_storage.  If *value* is
> + *   **NULL**, the new bpf_local_storage will be zero initialized.
> + *   Return
> + *   A bpf_local_storage pointer is returned on success.
> + *
> + *   **NULL** if not found or there was an error in adding
> + *   a new bpf_local_storage.
> + *
> + * int bpf_task_storage_delete(struct bpf_map *map, void *task)
> + *   Description
> + *   Delete a bpf_local_storage from a *task*.
> + *   Return
> + *   0 on success.
> + *
> + *   **-ENOENT** if the bpf_local_storage cannot be found.
>   */
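
Applying the pointer-type comment above, the two prototypes in the UAPI comment
would read like this (sketch):

 * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags)
 * int bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task)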


Re: [PATCH bpf-next 1/5] bpf: Implement task local storage

2020-10-28 Thread Martin KaFai Lau
On Tue, Oct 27, 2020 at 06:03:13PM +0100, KP Singh wrote:
[ ... ]

> diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> new file mode 100644
> index ..774140c458cc
> --- /dev/null
> +++ b/kernel/bpf/bpf_task_storage.c
> @@ -0,0 +1,327 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2019 Facebook
> + * Copyright 2020 Google LLC.
> + */
> +
> +#include "linux/pid.h"
> +#include "linux/sched.h"
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
Is this required?

> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +DEFINE_BPF_STORAGE_CACHE(task_cache);
> +
> +static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
> +{
> + struct task_struct *task = owner;
> + struct bpf_storage_blob *bsb;
> +
> + bsb = bpf_task(task);
> + if (!bsb)
> + return NULL;
> + return &bsb->storage;
> +}
> +
> +static struct bpf_local_storage_data *
> +task_storage_lookup(struct task_struct *task, struct bpf_map *map,
> + bool cacheit_lockit)
> +{
> + struct bpf_local_storage *task_storage;
> + struct bpf_local_storage_map *smap;
> + struct bpf_storage_blob *bsb;
> +
> + bsb = bpf_task(task);
> + if (!bsb)
> + return NULL;
> +
> + task_storage = rcu_dereference(bsb->storage);
> + if (!task_storage)
> + return NULL;
> +
> + smap = (struct bpf_local_storage_map *)map;
> + return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit);
> +}
> +

[ ... ]

> +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
> +{
> + struct bpf_local_storage_data *sdata;
> + struct task_struct *task;
> + struct pid *pid;
> + struct file *f;
> + int fd, err;
> +
> + fd = *(int *)key;
> + f = fget_raw(fd);
> + if (!f)
> + return ERR_PTR(-EBADF);
> +
> + if (f->f_op != &pidfd_fops) {
> + err = -EBADF;
> + goto out_fput;
> + }
> +
> + pid = get_pid(f->private_data);
n00b question. Is get_pid(f->private_data) required?
f->private_data could be freed while holding f->f_count?

> + task = get_pid_task(pid, PIDTYPE_PID);
Should put_task_struct() be called before returning?

> + if (!task || !task_storage_ptr(task)) {
"!task_storage_ptr(task)" is unnecessary, task_storage_lookup() should
have taken care of it.


> + err = -ENOENT;
> + goto out;
> + }
> +
> + sdata = task_storage_lookup(task, map, true);
> + put_pid(pid);
> + return sdata ? sdata->data : NULL;
> +out:
> + put_pid(pid);
> +out_fput:
> + fput(f);
> + return ERR_PTR(err);
> +}
> +
[ ... ]

> +static int task_storage_map_btf_id;
> +const struct bpf_map_ops task_storage_map_ops = {
> + .map_meta_equal = bpf_map_meta_equal,
> + .map_alloc_check = bpf_local_storage_map_alloc_check,
> + .map_alloc = task_storage_map_alloc,
> + .map_free = task_storage_map_free,
> + .map_get_next_key = notsupp_get_next_key,
> + .map_lookup_elem = bpf_pid_task_storage_lookup_elem,
> + .map_update_elem = bpf_pid_task_storage_update_elem,
> + .map_delete_elem = bpf_pid_task_storage_delete_elem,
Please exercise the syscall use cases also in the selftest.

> + .map_check_btf = bpf_local_storage_map_check_btf,
> + .map_btf_name = "bpf_local_storage_map",
> + .map_btf_id = &task_storage_map_btf_id,
> + .map_owner_storage_ptr = task_storage_ptr,
> +};
> +
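
To illustrate the syscall use cases mentioned above, a selftest could exercise
the map ops roughly as follows (a sketch; map_fd, an int-sized map value and
the CHECK() macro from test_progs are assumptions for the example, and the
usual selftest includes are omitted):

	int pidfd = syscall(SYS_pidfd_open, getpid(), 0);
	int val = 1000, out;

	CHECK(pidfd < 0, "pidfd_open", "errno %d\n", errno);

	CHECK(bpf_map_update_elem(map_fd, &pidfd, &val, BPF_NOEXIST),
	      "bpf_map_update_elem", "errno %d\n", errno);
	CHECK(bpf_map_lookup_elem(map_fd, &pidfd, &out) || out != val,
	      "bpf_map_lookup_elem", "errno %d\n", errno);
	CHECK(bpf_map_delete_elem(map_fd, &pidfd),
	      "bpf_map_delete_elem", "errno %d\n", errno);
	close(pidfd);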


Re: [PATCH bpf-next 2/5] bpf: Implement get_current_task_btf and RET_PTR_TO_BTF_ID

2020-10-28 Thread Martin KaFai Lau
On Tue, Oct 27, 2020 at 06:03:14PM +0100, KP Singh wrote:
[ ... ]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index b0790876694f..eb0aef85fc11 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -493,7 +493,8 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
>   func_id == BPF_FUNC_skc_to_tcp6_sock ||
>   func_id == BPF_FUNC_skc_to_udp6_sock ||
>   func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
> - func_id == BPF_FUNC_skc_to_tcp_request_sock;
> + func_id == BPF_FUNC_skc_to_tcp_request_sock ||
> + func_id == BPF_FUNC_get_current_task_btf;
This change is not needed.  The helper does not take any
argument or do any type casting.



Re: [PATCH bpf-next v2 4/4] selftest: bpf: Test copying a sockmap and sockhash

2020-09-29 Thread Martin KaFai Lau
On Tue, Sep 29, 2020 at 10:21:05AM +0100, Lorenz Bauer wrote:
> On Tue, 29 Sep 2020 at 07:06, Martin KaFai Lau  wrote:
> 
> ...
> 
> > > + /* We need a temporary buffer on the stack, since the verifier 
> > > doesn't
> > > +  * let us use the pointer from the context as an argument to the 
> > > helper.
> > Is it something that can be improved later?
> >
> > others LGTM.
> 
> Yeah, I think so. We'd need to do something similar to your
> sock_common work for PTR_TO_RDONLY_BUF_OR_NULL. The fact that the
> pointer is read only makes it a bit more difficult I think. After
I thought the key arg should be used as read-only in the map's helper,
or is there a map type whose helper modifies the key?

> that, a user could just plug the key into map_update_elem directly.
> Alternatively, allow specialising map_ops per context.


Re: [PATCH bpf-next v2 4/4] selftest: bpf: Test copying a sockmap and sockhash

2020-09-29 Thread Martin KaFai Lau
On Mon, Sep 28, 2020 at 10:08:05AM +0100, Lorenz Bauer wrote:

[ ... ]

>  SEC("iter/sockmap")
> -int count_elems(struct bpf_iter__sockmap *ctx)
> +int copy(struct bpf_iter__sockmap *ctx)
>  {
>   struct sock *sk = ctx->sk;
>   __u32 tmp, *key = ctx->key;
>   int ret;
>  
> - if (key)
> - elems++;
> + if (!key)
> + return 0;
>  
> - if (sk)
> + elems++;
> +
> + /* We need a temporary buffer on the stack, since the verifier doesn't
> +  * let us use the pointer from the context as an argument to the helper.
Is it something that can be improved later?

others LGTM.
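
For context, the quoted comment is followed in the selftest by code along these
lines: the read-only ctx->key is copied into a stack variable before it is
handed to the map helper (dst stands for the destination sockmap defined
elsewhere in the test):

	tmp = *key;

	if (sk)
		return bpf_map_update_elem(&dst, &tmp, sk, 0) != 0;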


Re: [PATCH bpf-next v2 2/4] selftests: bpf: Add helper to compare socket cookies

2020-09-28 Thread Martin KaFai Lau
On Mon, Sep 28, 2020 at 10:08:03AM +0100, Lorenz Bauer wrote:
> We compare socket cookies to ensure that insertion into a sockmap worked.
> Pull this out into a helper function for use in other tests.
> 
> Signed-off-by: Lorenz Bauer 
> ---
>  .../selftests/bpf/prog_tests/sockmap_basic.c  | 50 +--
>  1 file changed, 36 insertions(+), 14 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c 
> b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
> index 4b7a527e7e82..67d3301bdf81 100644
> --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
> +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
> @@ -50,6 +50,37 @@ static int connected_socket_v4(void)
>   return -1;
>  }
>  
> +static void compare_cookies(struct bpf_map *src, struct bpf_map *dst)
> +{
> + __u32 i, max_entries = bpf_map__max_entries(src);
> + int err, duration, src_fd, dst_fd;
This should have a compiler warning.  "duration" is not initialized.


Re: [PATCH bpf-next v2 1/4] bpf: sockmap: enable map_update_elem from bpf_iter

2020-09-28 Thread Martin KaFai Lau
On Mon, Sep 28, 2020 at 10:08:02AM +0100, Lorenz Bauer wrote:
> Allow passing a pointer to a BTF struct sock_common* when updating
> a sockmap or sockhash. Since BTF pointers can fault and therefore be
> NULL at runtime we need to add an additional !sk check to
> sock_map_update_elem. Since we may be passed a request or timewait
> socket we also need to check sk_fullsock. Doing this allows calling
> map_update_elem on sockmap from bpf_iter context, which uses
> BTF pointers.
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next 1/4] bpf: sockmap: enable map_update_elem from bpf_iter

2020-09-25 Thread Martin KaFai Lau
On Fri, Sep 25, 2020 at 10:56:27AM +0100, Lorenz Bauer wrote:
[ ... ]

> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index e1f05e3fa1d0..497e7df466d4 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -610,6 +610,9 @@ static int sock_map_update_elem(struct bpf_map *map, void 
> *key,
>   struct sock *sk = (struct sock *)value;
>   int ret;
>  
> + if (unlikely(!sk))
sk_fullsock(sk) test is also needed.

> + return -EINVAL;

> +
>   if (!sock_map_sk_is_suitable(sk))
sk->sk_type is used in sock_map_sk_is_suitable().
sk_type is not in sock_common.
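
Putting the two comments together, the entry check would need to look roughly
like this (sketch only):

	struct sock *sk = (struct sock *)value;

	if (unlikely(!sk || !sk_fullsock(sk)))
		return -EINVAL;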


Re: [PATCH bpf-next v9 5/7] bpf: Implement bpf_local_storage for inodes

2020-08-25 Thread Martin KaFai Lau
On Tue, Aug 25, 2020 at 04:10:26PM +0200, KP Singh wrote:
> 
> 
> On 8/25/20 2:52 AM, Martin KaFai Lau wrote:
> > On Sun, Aug 23, 2020 at 06:56:10PM +0200, KP Singh wrote:
> >> From: KP Singh 
> >>
> >> Similar to bpf_local_storage for sockets, add local storage for inodes.
> >> The life-cycle of storage is managed with the life-cycle of the inode.
> >> i.e. the storage is destroyed along with the owning inode.
> >>
> >> The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the
> >> security blob which are now stackable and can co-exist with other LSMs.
> >>
> > [ ... ]
> > 
> >> diff --git a/kernel/bpf/bpf_inode_storage.c 
> >> b/kernel/bpf/bpf_inode_storage.c
> >> new file mode 100644
> >> index ..b0b283c224c1
> >> --- /dev/null
> >> +++ b/kernel/bpf/bpf_inode_storage.c
> 
> [...]
> 
> >> +
> >> +DEFINE_BPF_STORAGE_CACHE(inode_cache);
> >> +
> >> +static struct bpf_local_storage __rcu **
> >> +inode_storage_ptr(void *owner)
> >> +{
> >> +  struct inode *inode = owner;
> >> +  struct bpf_storage_blob *bsb;
> >> +
> >> +  bsb = bpf_inode(inode);
> >> +  if (!bsb)
> >> +  return NULL;
> > just noticed this one.  NULL could be returned here.  When will it happen?
> 
> This can happen if CONFIG_BPF_LSM is enabled but "bpf" is not in the list of
> active LSMs.
> 
> > 
> >> +  return &bsb->storage;
> >> +}
> >> +
> >> +static struct bpf_local_storage_data *inode_storage_lookup(struct inode 
> >> *inode,
> >> + struct bpf_map *map,
> >> + bool cacheit_lockit)
> >> +{
> 
> [...]
> 
> > path first before calling the bpf_local_storage_update() since
> > this case is specific to inode local storage.
> > 
> > Same for the other bpf_local_storage_update() cases.
> 
> If you're okay with this I can send a new series with the following updates.
> 
> diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
> index b0b283c224c1..74546cee814d 100644
> --- a/kernel/bpf/bpf_inode_storage.c
> +++ b/kernel/bpf/bpf_inode_storage.c
> @@ -125,7 +125,7 @@ static int bpf_fd_inode_storage_update_elem(struct 
> bpf_map *map, void *key,
>  
> fd = *(int *)key;
> f = fget_raw(fd);
> -   if (!f)
> +   if (!f || !inode_storage_ptr(f->f_inode))
> return -EBADF;
>  
> sdata = bpf_local_storage_update(f->f_inode,
> @@ -171,6 +171,14 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, 
> struct inode *, inode,
> if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
> return (unsigned long)NULL;
>  
> +   /* explicitly check that the inode_storage_ptr is not
> +* NULL as inode_storage_lookup returns NULL in this case and
> +* and bpf_local_storage_update expects the owner to have a
> +* valid storage pointer.
> +*/
> +   if (!inode_storage_ptr(inode))
> +   return (unsigned long)NULL;
LGTM


Re: [PATCH bpf-next v9 4/7] bpf: Split bpf_local_storage to bpf_sk_storage

2020-08-24 Thread Martin KaFai Lau
On Sun, Aug 23, 2020 at 06:56:09PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> A purely mechanical change:
> 
>   bpf_sk_storage.c = bpf_sk_storage.c + bpf_local_storage.c
>   bpf_sk_storage.h = bpf_sk_storage.h + bpf_local_storage.h
> 
> Signed-off-by: KP Singh 
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v9 3/7] bpf: Generalize bpf_sk_storage

2020-08-24 Thread Martin KaFai Lau
On Sun, Aug 23, 2020 at 06:56:08PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> Refactor the functionality in bpf_sk_storage.c so that concept of
> storage linked to kernel objects can be extended to other objects like
> inode, task_struct etc.
> 
> Each new local storage will still be a separate map and provide its own
> set of helpers. This allows for future object specific extensions and
> still share a lot of the underlying implementation.
> 
> This includes the changes suggested by Martin in:
> 
>   https://lore.kernel.org/bpf/20200725013047.4006241-1-ka...@fb.com/
> 
> adding new map operations to support bpf_local_storage maps:
> 
> * storages for different kernel objects to optionally have different
>   memory charging strategy (map_local_storage_charge,
>   map_local_storage_uncharge)
> * Functionality to extract the storage pointer from a pointer to the
>   owning object (map_owner_storage_ptr)
> 
> Co-developed-by: Martin KaFai Lau 
> Signed-off-by: KP Singh 
Signed-off-by: Martin KaFai Lau 


Re: [PATCH bpf-next v9 5/7] bpf: Implement bpf_local_storage for inodes

2020-08-24 Thread Martin KaFai Lau
On Sun, Aug 23, 2020 at 06:56:10PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> Similar to bpf_local_storage for sockets, add local storage for inodes.
> The life-cycle of storage is managed with the life-cycle of the inode.
> i.e. the storage is destroyed along with the owning inode.
> 
> The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the
> security blob which are now stackable and can co-exist with other LSMs.
> 
[ ... ]

> diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
> new file mode 100644
> index ..b0b283c224c1
> --- /dev/null
> +++ b/kernel/bpf/bpf_inode_storage.c
> @@ -0,0 +1,265 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2019 Facebook
> + * Copyright 2020 Google LLC.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +DEFINE_BPF_STORAGE_CACHE(inode_cache);
> +
> +static struct bpf_local_storage __rcu **
> +inode_storage_ptr(void *owner)
> +{
> + struct inode *inode = owner;
> + struct bpf_storage_blob *bsb;
> +
> + bsb = bpf_inode(inode);
> + if (!bsb)
> + return NULL;
just noticed this one.  NULL could be returned here.  When will it happen?

> + return &bsb->storage;
> +}
> +
> +static struct bpf_local_storage_data *inode_storage_lookup(struct inode 
> *inode,
> +struct bpf_map *map,
> +bool cacheit_lockit)
> +{
> + struct bpf_local_storage *inode_storage;
> + struct bpf_local_storage_map *smap;
> + struct bpf_storage_blob *bsb;
> +
> + bsb = bpf_inode(inode);
> + if (!bsb)
> + return NULL;
lookup is fine since NULL is checked here.

> +
> + inode_storage = rcu_dereference(bsb->storage);
> + if (!inode_storage)
> + return NULL;
> +
> + smap = (struct bpf_local_storage_map *)map;
> + return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit);
> +}
> +

[ ... ]

> +static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
> +  void *value, u64 map_flags)
> +{
> + struct bpf_local_storage_data *sdata;
> + struct file *f;
> + int fd;
> +
> + fd = *(int *)key;
> + f = fget_raw(fd);
> + if (!f)
> + return -EBADF;
> +
> + sdata = bpf_local_storage_update(f->f_inode,
This will be an issue.  bpf_local_storage_update() will not check NULL
returned by inode_storage_ptr().  It should be checked here in the inode code
path first before calling the bpf_local_storage_update() since
this case is specific to inode local storage.

Same for the other bpf_local_storage_update() cases.

> +  (struct bpf_local_storage_map *)map,
> +  value, map_flags);
> + fput(f);
> + return PTR_ERR_OR_ZERO(sdata);
> +}
> +
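
A sketch of the early owner check being asked for here, with the file reference
dropped on the error path, could be:

	fd = *(int *)key;
	f = fget_raw(fd);
	if (!f)
		return -EBADF;

	if (!inode_storage_ptr(f->f_inode)) {
		fput(f);
		return -EBADF;
	}

	sdata = bpf_local_storage_update(f->f_inode,
					 (struct bpf_local_storage_map *)map,
					 value, map_flags);
	fput(f);
	return PTR_ERR_OR_ZERO(sdata);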

[ ... ]

> +BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, 
> inode,
> +void *, value, u64, flags)
> +{
> + struct bpf_local_storage_data *sdata;
> +
> + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
> + return (unsigned long)NULL;
> +
> + sdata = inode_storage_lookup(inode, map, true);
> + if (sdata)
> + return (unsigned long)sdata->data;
> +
> + /* This helper must only called from where the inode is gurranteed
> +  * to have a refcount and cannot be freed.
> +  */
> + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
> + sdata = bpf_local_storage_update(
> + inode, (struct bpf_local_storage_map *)map, value,
> + BPF_NOEXIST);
> + return IS_ERR(sdata) ? (unsigned long)NULL :
> +  (unsigned long)sdata->data;
> + }
> +
> + return (unsigned long)NULL;
> +}
> +

> diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c
> index 32d32d485451..35f9b19259e5 100644
> --- a/security/bpf/hooks.c
> +++ b/security/bpf/hooks.c
> @@ -3,6 +3,7 @@
>  /*
>   * Copyright (C) 2020 Google LLC.
>   */
> +#include 
Is it needed?

>  #include 
>  #include 
>  
> @@ -11,6 +12,7 @@ static struct security_hook_list bpf_lsm_hooks[] 
> __lsm_ro_after_init = {
>   LSM_HOOK_INIT(NAME, bpf_lsm_##NAME),
>   #include 
>   #undef LSM_HOOK
> + LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free),
>  };
>  
>  static int __init bpf_lsm_init(void)
> @@ -20,7 +22,12 @@ static int __init bpf_lsm_init(void)
>   return 0;
>  }
>  
> +struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = {
> + .lbs_inode = sizeof(struct bpf_storage_blob),
> +};
> +
>  DEFINE_LSM(bpf) = {
>   .name = "bpf",
>   .init = bpf_lsm_init,
> + .blobs = &bpf_lsm_blob_sizes
>  };


Re: [PATCH bpf-next v8 3/7] bpf: Generalize bpf_sk_storage

2020-08-19 Thread Martin KaFai Lau
On Wed, Aug 19, 2020 at 02:41:50PM +0200, KP Singh wrote:
> On 8/18/20 3:05 AM, Martin KaFai Lau wrote:
> > On Mon, Aug 03, 2020 at 06:46:51PM +0200, KP Singh wrote:
> >> From: KP Singh 
> >>
> >> Refactor the functionality in bpf_sk_storage.c so that concept of
> >> storage linked to kernel objects can be extended to other objects like
> >> inode, task_struct etc.
> >>
> >> Each new local storage will still be a separate map and provide its own
> >> set of helpers. This allows for future object specific extensions and
> >> still share a lot of the underlying implementation.
> >>
> >> This includes the changes suggested by Martin in:
> >>
> >>   https://lore.kernel.org/bpf/20200725013047.4006241-1-ka...@fb.com/
> >>
> >> which adds map_local_storage_charge, map_local_storage_uncharge,
> >> and map_owner_storage_ptr.
> > A description will still be useful in the commit message to talk
> > about the new map_ops, e.g.
> > they allow kernel object to optionally have different mem-charge strategy.
> > 
> >>
> >> Co-developed-by: Martin KaFai Lau 
> >> Signed-off-by: KP Singh 
> >> ---
> >>  include/linux/bpf.h|   9 ++
> >>  include/net/bpf_sk_storage.h   |  51 +++
> >>  include/uapi/linux/bpf.h   |   8 +-
> >>  net/core/bpf_sk_storage.c  | 246 +
> >>  tools/include/uapi/linux/bpf.h |   8 +-
> >>  5 files changed, 233 insertions(+), 89 deletions(-)
> >>
> 
> >> +  struct bpf_local_storage_map *smap,
> >> +  struct bpf_local_storage_elem *first_selem);
> >> +
> >> +struct bpf_local_storage_data *
> >> +bpf_local_storage_update(void *owner, struct bpf_map *map, void *value,
> > Nit.  It may be more consistent to take "struct bpf_local_storage_map *smap"
> > instead of "struct bpf_map *map" here.
> > 
> > bpf_local_storage_map_check_btf() will be the only one taking
> > "struct bpf_map *map".
> 
> That's because it is used in map operations as map_check_btf which expects
> a bpf_map *map pointer. We can wrap it in another function but is that
> worth doing?
Agree.  bpf_local_storage_map_check_btf() should stay as is.

I meant to only change the "bpf_local_storage_update()" to take
"struct bpf_local_storage_map *smap".

> > 
> >> *
> >> * The elem of this map can be cleaned up here
> >> * or
> >> -   * by bpf_sk_storage_free() during __sk_destruct().
> >> +   * by bpf_local_storage_free() during the destruction of the
> >> +   * owner object. eg. __sk_destruct.
> > This belongs to patch 1 also.
> 
> 
> In patch, 1, changed it to:
> 
>* The elem of this map can be cleaned up here
>* or when the storage is freed e.g.
>* by bpf_sk_storage_free() during __sk_destruct().
>
+1



Re: [PATCH] bpf: Add bpf_skb_get_sock_comm() helper

2020-08-19 Thread Martin KaFai Lau
On Wed, Aug 19, 2020 at 02:10:56PM +0800, 江禹 wrote:
> Dear Martin,
> 
> 
> > One possibility is to use the "sk_bpf_storage" member immediately above
> > instead of adding "sk_task_com[]".
> > 
> > It is an extensible sk storage for bpf.  There are examples in selftests,
> > e.g tools/testing/selftests/bpf/progs/udp_limits.c which creates sk storage
> > at socket creation time.  Another hook point option could be "connect()"
> > for tcp, i.e. "cgroup/connect[46]".
> > 
> > Search "BPF_MAP_TYPE_SK_STORAGE" under the selftests/bpf for other examples.
> > 
> > It seems there is already a "bpf_get_current_comm()" helper which
> > could be used to initialize the task comm string in the bpf sk storage.
> > 
> > btw, bpf-next is still closed.
> 
> 
> I have rewritten my code according to your suggestion.  In general, it works as
> designed.
> 
> 
> But the task comm string obtained from the "bpf_get_current_comm()" helper
> belongs to a specific thread.
> It is not an obvious label for skb tracing. A more reasonable tracing key is
> the task comm of the process to which this skb belongs.
>  
> It seems a new bpf helper is still needed.
Maybe.  It is not clear to me whether it is better to account this skb
to its process or to its thread.  I think this depends on the
use-case/assumption.

If bpf_get_current_comm() does not get what you need,
then another helper may be added to get the process comm.
This new helper may be useful for other use cases also.

btw, have you thought about tracing at sendmsg time?
e.g. tcp_sendmsg()?


Re: [PATCH bpf-next v8 5/7] bpf: Implement bpf_local_storage for inodes

2020-08-18 Thread Martin KaFai Lau
On Tue, Aug 18, 2020 at 05:10:34PM +0200, KP Singh wrote:
> 
> 
> On 8/18/20 3:27 AM, Martin KaFai Lau wrote:
> > On Mon, Aug 03, 2020 at 06:46:53PM +0200, KP Singh wrote:
> >> From: KP Singh 
> >>
> >> Similar to bpf_local_storage for sockets, add local storage for inodes.
> >> The life-cycle of storage is managed with the life-cycle of the inode.
> >> i.e. the storage is destroyed along with the owning inode.
> >>
> >> The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the
> >> security blob which are now stackable and can co-exist with other LSMs.
> >>
> >> Signed-off-by: KP Singh 
> >> ---
> >>  include/linux/bpf_local_storage.h |  10 +
> >>  include/linux/bpf_lsm.h   |  21 ++
> >>  include/linux/bpf_types.h |   3 +
> >>  include/uapi/linux/bpf.h  |  38 +++
> >>  kernel/bpf/Makefile   |   1 +
> 
> [...]
> 
> ata *inode_storage_lookup(struct inode *inode,
> >> + struct bpf_map *map,
> >> + bool cacheit_lockit)
> >> +{
> >> +  struct bpf_local_storage *inode_storage;
> >> +  struct bpf_local_storage_map *smap;
> >> +  struct bpf_storage_blob *bsb;
> >> +
> >> +  bsb = bpf_inode(inode);
> >> +  if (!bsb)
> >> +  return ERR_PTR(-ENOENT);
> > ERR_PTR is returned here...
> > 
> >> +
> >> +  inode_storage = rcu_dereference(bsb->storage);
> >> +  if (!inode_storage)
> >> +  return NULL;
> >> +
> 
> [...]
> 
> >> +  kfree_rcu(local_storage, rcu);
> >> +}
> >> +
> >> +
> >> +static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void 
> >> *key)
> >> +{
> >> +  struct bpf_local_storage_data *sdata;
> >> +  struct file *f;
> >> +  int fd;
> >> +
> >> +  fd = *(int *)key;
> >> +  f = fcheck(fd);
> >> +  if (!f)
> >> +  return ERR_PTR(-EINVAL);
> >> +
> >> +  get_file(f);
> >> +  sdata = inode_storage_lookup(f->f_inode, map, true);
> >> +  fput(f);
> >> +  return sdata ? sdata->data : NULL;
> > sdata can be ERR_PTR here and a few other cases below.
> > 
> > May be inode_storage_lookup() should just return NULL.
> 
> I think returning NULL is a better option. Thanks!
> 
> > 
> >> +}
> >> +
> >> +static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void 
> >> *key,
> >> +   void *value, u64 map_flags)
> >> +{
> >> +  struct bpf_local_storage_data *sdata;
> >> +  struct file *f;
> >> +  int fd;
> >> +
> >> +  fd = *(int *)key;
> >> +  f = fcheck(fd);
> >> +  if (!f)
> >> +  return -EINVAL;
> >> +
> >> +  get_file(f);
> > get_file() does atomic_long_inc() instead of atomic_long_inc_not_zero().
> > I don't see how that helps here.  Am I missing something?
> 
> You are right, this should not not be an fcheck followed by a get_file
> rather fcheck followed by get_file_rcu:
> 
> #define get_file_rcu_many(x, cnt) \
>   atomic_long_add_unless(&(x)->f_count, (cnt), 0)
> #define get_file_rcu(x) get_file_rcu_many((x), 1)
> #define file_count(x) atomic_long_read(&(x)->f_count)
> 
> But there is an easier way than all of this and this is to use 
> fget_raw which also calls get_file_rcu_many 
> and ensures a non-zero count before getting a reference.
ic. Make sense.

There are fdget() and fdput() also which are used in bpf/syscall.c.
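
For reference, the fdget()/fdput() pattern mentioned here, applied to e.g. the
delete path, would look roughly like this (sketch):

	struct fd f = fdget(fd);
	int err;

	if (!f.file)
		return -EBADF;

	err = inode_storage_delete(file_inode(f.file), map);
	fdput(f);
	return err;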


Re: [PATCH bpf-next v8 6/7] bpf: Allow local storage to be used from LSM programs

2020-08-17 Thread Martin KaFai Lau
On Mon, Aug 03, 2020 at 06:46:54PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> Adds support for both bpf_{sk, inode}_storage_{get, delete} to be used
> in LSM programs. These helpers are not used for tracing programs
> (currently) as their usage is tied to the life-cycle of the object and
> should only be used where the owning object won't be freed (when the
> owning object is passed as an argument to the LSM hook). Thus, they
> are safer to use in LSM hooks than tracing. Usage of local storage in
> tracing programs will probably follow a per function based whitelist
> approach.
> 
> Since the UAPI helper signature for bpf_sk_storage expect a bpf_sock,
> it, leads to a compilation warning for LSM programs, it's also updated
> to accept a void * pointer instead.
> 
> Signed-off-by: KP Singh 
> ---
>  include/net/bpf_sk_storage.h   |  2 ++
>  include/uapi/linux/bpf.h   |  8 ++--
>  kernel/bpf/bpf_lsm.c   | 21 -
>  net/core/bpf_sk_storage.c  | 25 +
>  tools/include/uapi/linux/bpf.h |  8 ++--
>  5 files changed, 59 insertions(+), 5 deletions(-)
> 
> diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> index 847926cf2899..c5702d7baeaa 100644
> --- a/include/net/bpf_sk_storage.h
> +++ b/include/net/bpf_sk_storage.h
> @@ -20,6 +20,8 @@ void bpf_sk_storage_free(struct sock *sk);
>  
>  extern const struct bpf_func_proto bpf_sk_storage_get_proto;
>  extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
> +extern const struct bpf_func_proto sk_storage_get_btf_proto;
> +extern const struct bpf_func_proto sk_storage_delete_btf_proto;
>  
>  struct bpf_sk_storage_diag;
>  struct sk_buff;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e17c00eea5d8..6ffc61dafc5c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2807,7 +2807,7 @@ union bpf_attr {
>   *
>   *   **-ERANGE** if resulting value was out of range.
>   *
> - * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void 
> *value, u64 flags)
> + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 
> flags)
>   *   Description
>   *   Get a bpf-local-storage from a *sk*.
>   *
> @@ -2823,6 +2823,10 @@ union bpf_attr {
>   *   "type". The bpf-local-storage "type" (i.e. the *map*) is
>   *   searched against all bpf-local-storages residing at *sk*.
>   *
> + *   For socket programs, *sk* should be a **struct bpf_sock** 
> pointer
> + *   and an **ARG_PTR_TO_BTF_ID** of type **struct sock** for LSM
> + *   programs.
I found it a little vague on what "socket programs" is.  May be:

*sk* is a kernel **struct sock** pointer for LSM program.
*sk* is a **struct bpf_sock** pointer for other program types.

Others LGTM

Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v8 5/7] bpf: Implement bpf_local_storage for inodes

2020-08-17 Thread Martin KaFai Lau
On Mon, Aug 03, 2020 at 06:46:53PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> Similar to bpf_local_storage for sockets, add local storage for inodes.
> The life-cycle of storage is managed with the life-cycle of the inode.
> i.e. the storage is destroyed along with the owning inode.
> 
> The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the
> security blob which are now stackable and can co-exist with other LSMs.
> 
> Signed-off-by: KP Singh 
> ---
>  include/linux/bpf_local_storage.h |  10 +
>  include/linux/bpf_lsm.h   |  21 ++
>  include/linux/bpf_types.h |   3 +
>  include/uapi/linux/bpf.h  |  38 +++
>  kernel/bpf/Makefile   |   1 +
>  kernel/bpf/bpf_inode_storage.c| 265 ++
>  kernel/bpf/syscall.c  |   3 +-
>  kernel/bpf/verifier.c |  10 +
>  security/bpf/hooks.c  |   7 +
>  .../bpf/bpftool/Documentation/bpftool-map.rst |   2 +-
>  tools/bpf/bpftool/bash-completion/bpftool |   3 +-
>  tools/bpf/bpftool/map.c   |   3 +-
>  tools/include/uapi/linux/bpf.h|  38 +++
>  tools/lib/bpf/libbpf_probes.c |   5 +-
>  14 files changed, 403 insertions(+), 6 deletions(-)
>  create mode 100644 kernel/bpf/bpf_inode_storage.c
> 
> diff --git a/include/linux/bpf_local_storage.h 
> b/include/linux/bpf_local_storage.h
> index 3685f681a7fc..75c76847fad5 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -160,4 +160,14 @@ struct bpf_local_storage_data *
>  bpf_local_storage_update(void *owner, struct bpf_map *map, void *value,
>u64 map_flags);
>  
> +#ifdef CONFIG_BPF_LSM
> +extern const struct bpf_func_proto bpf_inode_storage_get_proto;
> +extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
> +void bpf_inode_storage_free(struct inode *inode);
> +#else
> +static inline void bpf_inode_storage_free(struct inode *inode)
> +{
> +}
> +#endif /* CONFIG_BPF_LSM */
This is LSM specific.  Can they be moved to bpf_lsm.h?

> +
>  #endif /* _BPF_LOCAL_STORAGE_H */
> diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> index af74712af585..d0683ada1e49 100644
> --- a/include/linux/bpf_lsm.h
> +++ b/include/linux/bpf_lsm.h
> @@ -17,9 +17,24 @@
>  #include 
>  #undef LSM_HOOK
>  
> +struct bpf_storage_blob {
> + struct bpf_local_storage __rcu *storage;
> +};
> +
> +extern struct lsm_blob_sizes bpf_lsm_blob_sizes;
> +
>  int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
>   const struct bpf_prog *prog);
>  
> +static inline struct bpf_storage_blob *bpf_inode(
> + const struct inode *inode)
> +{
> + if (unlikely(!inode->i_security))
> + return NULL;
> +
> + return inode->i_security + bpf_lsm_blob_sizes.lbs_inode;
> +}
> +
>  #else /* !CONFIG_BPF_LSM */
>  
>  static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
> @@ -28,6 +43,12 @@ static inline int bpf_lsm_verify_prog(struct 
> bpf_verifier_log *vlog,
>   return -EOPNOTSUPP;
>  }
>  
> +static inline struct bpf_storage_blob *bpf_inode(
> + const struct inode *inode)
> +{
> + return NULL;
> +}
> +
>  #endif /* CONFIG_BPF_LSM */
>  
>  #endif /* _LINUX_BPF_LSM_H */

[ ... ]

> diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
> new file mode 100644
> index ..a51a6328c02d
> --- /dev/null
> +++ b/kernel/bpf/bpf_inode_storage.c
> @@ -0,0 +1,265 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2019 Facebook
> + * Copyright 2020 Google LLC.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +DEFINE_BPF_STORAGE_CACHE(inode_cache);
> +
> +static struct bpf_local_storage __rcu **
> +inode_storage_ptr(void *owner)
> +{
> + struct inode *inode = owner;
> + struct bpf_storage_blob *bsb;
> + bsb = bpf_inode(inode);
> + if (!bsb)
> + return NULL;
> + return &bsb->storage;
> +}
> +
> +static struct bpf_local_storage_data *inode_storage_lookup(struct inode 
> *inode,
> +struct bpf_map *map,
> +bool cacheit_lockit)
> +{
> + struct bpf_local_storage *inode_storage;
> + struct bpf_local_storage_map *smap;
> + struct bpf_storage_blob *bsb;
> +
> + bsb = bpf_inode(inode);
> + if (!bsb)
> + return ERR_PTR(-ENOENT);
ERR_PTR is returned here...

> +
> + inode_storage = rcu_dereference(bsb->storage);
> + if (!inode_storage)
> + return NULL;
> +
> + smap = (struct bpf_local_storage_map *)map;
> + return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit);
> +}
> +
> +void 

Re: [PATCH bpf-next v8 3/7] bpf: Generalize bpf_sk_storage

2020-08-17 Thread Martin KaFai Lau
On Mon, Aug 03, 2020 at 06:46:51PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> Refactor the functionality in bpf_sk_storage.c so that concept of
> storage linked to kernel objects can be extended to other objects like
> inode, task_struct etc.
> 
> Each new local storage will still be a separate map and provide its own
> set of helpers. This allows for future object specific extensions and
> still share a lot of the underlying implementation.
> 
> This includes the changes suggested by Martin in:
> 
>   https://lore.kernel.org/bpf/20200725013047.4006241-1-ka...@fb.com/
> 
> which adds map_local_storage_charge, map_local_storage_uncharge,
> and map_owner_storage_ptr.
A description will still be useful in the commit message to talk
about the new map_ops, e.g.
they allow kernel object to optionally have different mem-charge strategy.

> 
> Co-developed-by: Martin KaFai Lau 
> Signed-off-by: KP Singh 
> ---
>  include/linux/bpf.h|   9 ++
>  include/net/bpf_sk_storage.h   |  51 +++
>  include/uapi/linux/bpf.h   |   8 +-
>  net/core/bpf_sk_storage.c  | 246 +
>  tools/include/uapi/linux/bpf.h |   8 +-
>  5 files changed, 233 insertions(+), 89 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index cef4ef0d2b4e..8e1e23c60dc7 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -34,6 +34,9 @@ struct btf_type;
>  struct exception_table_entry;
>  struct seq_operations;
>  struct bpf_iter_aux_info;
> +struct bpf_local_storage;
> +struct bpf_local_storage_map;
> +struct bpf_local_storage_elem;
"struct bpf_local_storage_elem" is not needed.

>  
>  extern struct idr btf_idr;
>  extern spinlock_t btf_idr_lock;
> @@ -104,6 +107,12 @@ struct bpf_map_ops {
>   __poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
>struct poll_table_struct *pts);
>  
> + /* Functions called by bpf_local_storage maps */
> + int (*map_local_storage_charge)(struct bpf_local_storage_map *smap,
> + void *owner, u32 size);
> + void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap,
> +void *owner, u32 size);
> + struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
>   /* BTF name and id of struct allocated by map_alloc */
>   const char * const map_btf_name;
>   int *map_btf_id;
> diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> index 950c5aaba15e..05b777950eb3 100644
> --- a/include/net/bpf_sk_storage.h
> +++ b/include/net/bpf_sk_storage.h
> @@ -3,8 +3,15 @@
>  #ifndef _BPF_SK_STORAGE_H
>  #define _BPF_SK_STORAGE_H
>  
> +#include 
> +#include 
> +#include 
>  #include 
>  #include 
> +#include 
> +#include 
> +#include 
> +#include 
>  
>  struct sock;
>  
> @@ -34,6 +41,50 @@ u16 bpf_local_storage_cache_idx_get(struct 
> bpf_local_storage_cache *cache);
>  void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
> u16 idx);
>  
> +/* Helper functions for bpf_local_storage */
> +int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
> +
> +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr 
> *attr);
> +
> +struct bpf_local_storage_data *
> +bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
> +  struct bpf_local_storage_map *smap,
> +  bool cacheit_lockit);
> +
> +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap);
> +
> +int bpf_local_storage_map_check_btf(const struct bpf_map *map,
> + const struct btf *btf,
> + const struct btf_type *key_type,
> + const struct btf_type *value_type);
> +
> +void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
> + struct bpf_local_storage_elem *selem);
> +
> +bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage,
> +   struct bpf_local_storage_elem *selem,
> +   bool uncharge_omem);
> +
> +void bpf_selem_unlink(struct bpf_local_storage_elem *selem);
> +
> +void bpf_selem_link_map(struct bpf_local_storage_map *smap,
> + struct bpf_local_storage_elem *selem);
> +
> +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
> +
> +struct bpf_local_storage_elem *
> +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
> + bool charge_mem);
> +
> +int

Re: [PATCH bpf-next v8 2/7] bpf: Generalize caching for sk_storage.

2020-08-17 Thread Martin KaFai Lau
On Mon, Aug 03, 2020 at 06:46:50PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> Provide the a ability to define local storage caches on a per-object
> type basis. The caches and caching indices for different objects should
> not be inter-mixed as suggested in:
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next v8 1/7] A purely mechanical change to split the renaming from the actual generalization.

2020-08-17 Thread Martin KaFai Lau
 hlist_del_init_rcu(&selem->snode);
> - if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) ==
> + if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
>   SDATA(selem))
> - RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL);
> + RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
>  
>   kfree_rcu(selem, rcu);
>  
> - return free_sk_storage;
> + return free_local_storage;
>  }
>  
> -static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
> +static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
>  {
> - struct bpf_sk_storage *sk_storage;
> - bool free_sk_storage = false;
> + struct bpf_local_storage *local_storage;
> + bool free_local_storage = false;
>  
> - if (unlikely(!selem_linked_to_sk(selem)))
> + if (unlikely(!selem_linked_to_storage(selem)))
>   /* selem has already been unlinked from sk */
>   return;
>  
> - sk_storage = rcu_dereference(selem->sk_storage);
> - raw_spin_lock_bh(&sk_storage->lock);
> - if (likely(selem_linked_to_sk(selem)))
> - free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
> - raw_spin_unlock_bh(&sk_storage->lock);
> + local_storage = rcu_dereference(selem->local_storage);
> + raw_spin_lock_bh(&local_storage->lock);
> + if (likely(selem_linked_to_storage(selem)))
> + free_local_storage =
> + bpf_selem_unlink_storage(local_storage, selem, true);
> + raw_spin_unlock_bh(&local_storage->lock);
>  
> - if (free_sk_storage)
> - kfree_rcu(sk_storage, rcu);
> + if (free_local_storage)
> + kfree_rcu(local_storage, rcu);
>  }
>  
> -static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
> - struct bpf_sk_storage_elem *selem)
> +static void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
> +struct bpf_local_storage_elem *selem)
Same here. bpf_selem_link_storage"_nolock"().

Please tag the Subject line with "bpf:".

Acked-by: Martin KaFai Lau 


Re: [PATCH] bpf: Add bpf_skb_get_sock_comm() helper

2020-08-10 Thread Martin KaFai Lau
On Mon, Aug 10, 2020 at 06:09:48AM -0700, Jiang Yu wrote:
> skbs distinguished by uid can only be attributed to the user who consumes
> them. In many cases, an skb should be recorded more specifically, against
> the process that consumes it, e.g. the unexpectedly large data traffic of an
> illegal process on a metered network.
> 
> This helper is used to trace the task comm of the sock to which an skb
> belongs.
> 
> Signed-off-by: Jiang Yu 
> ---
>  include/net/sock.h |  1 +
>  include/uapi/linux/bpf.h   |  1 +
>  net/core/filter.c  | 32 
>  net/core/sock.c| 20 
>  tools/include/uapi/linux/bpf.h |  1 +
>  5 files changed, 55 insertions(+)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 064637d1ddf6..9c6e8e61940f 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -519,6 +519,7 @@ struct sock {
>  #ifdef CONFIG_BPF_SYSCALL
>   struct bpf_sk_storage __rcu *sk_bpf_storage;
>  #endif
> + char sk_task_com[TASK_COMM_LEN];
One possibility is to use the "sk_bpf_storage" member immediately above
instead of adding "sk_task_com[]".

It is an extensible sk storage for bpf.  There are examples in selftests,
e.g tools/testing/selftests/bpf/progs/udp_limits.c which creates sk storage
at socket creation time.  Another hook point option could be "connect()"
for tcp, i.e. "cgroup/connect[46]".

Search "BPF_MAP_TYPE_SK_STORAGE" under the selftests/bpf for other examples.

It seems there is already a "bpf_get_current_comm()" helper which
could be used to initialize the task comm string in the bpf sk storage.

btw, bpf-next is still closed.
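
For illustration, a minimal sketch of that approach (map, struct and section
names are made up for the example, it follows the style of the udp_limits.c
selftest mentioned above, and it assumes bpf_get_current_comm() is usable from
the chosen hook's program type):

	struct comm_storage {
		char comm[16];
	};

	struct {
		__uint(type, BPF_MAP_TYPE_SK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, struct comm_storage);
	} sk_comm_map SEC(".maps");

	SEC("cgroup/sock_create")
	int record_comm(struct bpf_sock *ctx)
	{
		struct comm_storage *st;

		st = bpf_sk_storage_get(&sk_comm_map, ctx, 0,
					BPF_SK_STORAGE_GET_F_CREATE);
		if (st)
			bpf_get_current_comm(st->comm, sizeof(st->comm));

		return 1;
	}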


Re: [PATCH bpf-next v7 5/7] bpf: Implement bpf_local_storage for inodes

2020-07-31 Thread Martin KaFai Lau
On Fri, Jul 31, 2020 at 02:08:55PM +0200, KP Singh wrote:
[ ... ]
> >> +const struct bpf_map_ops inode_storage_map_ops = {
> >> +  .map_alloc_check = bpf_local_storage_map_alloc_check,
> >> +  .map_alloc = inode_storage_map_alloc,
> >> +  .map_free = inode_storage_map_free,
> >> +  .map_get_next_key = notsupp_get_next_key,
> >> +  .map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
> >> +  .map_update_elem = bpf_fd_inode_storage_update_elem,
> >> +  .map_delete_elem = bpf_fd_inode_storage_delete_elem,
> >> +  .map_check_btf = bpf_local_storage_map_check_btf,
> >> +  .map_btf_name = "bpf_local_storage_map",
> >> +  .map_btf_id = _storage_map_btf_id,
> >> +  .map_owner_storage_ptr = inode_storage_ptr,
> >> +};
> >> +
> >> +BTF_ID_LIST(bpf_inode_storage_get_btf_ids)
> >> +BTF_ID(struct, inode)
> > The ARG_PTR_TO_BTF_ID is the second arg instead of the first
> > arg in bpf_inode_storage_get_proto.
> > Does it just happen to work here without needing BTF_ID_UNUSED?
> 
> 
> Yeah, this surprised me as to why it worked, so I did some debugging:
> 
> 
> diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
> index 9cd1428c7199..95e84bcf1a74 100644
> --- a/kernel/bpf/bpf_lsm.c
> +++ b/kernel/bpf/bpf_lsm.c
> @@ -52,6 +52,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct 
> bpf_prog *prog)
>  {
> switch (func_id) {
> case BPF_FUNC_inode_storage_get:
> +   pr_err("btf_ids[0]=%d\n", 
> bpf_inode_storage_get_proto.btf_id[0]);
> +   pr_err("btf_ids[1]=%d\n", 
> bpf_inode_storage_get_proto.btf_id[1]);
> return &bpf_inode_storage_get_proto;
> case BPF_FUNC_inode_storage_delete:
> return &bpf_inode_storage_delete_proto;
> 
> ./test_progs -t test_local_storage
> 
> [   21.694473] btf_ids[0]=915
> [   21.694974] btf_ids[1]=915
> 
> btf  dump file /sys/kernel/btf/vmlinux | grep "STRUCT 'inode'"
> "[915] STRUCT 'inode' size=984 vlen=48
> 
> So it seems like btf_id[0] and btf_id[1] are set to the BTF ID
> for inode. Now I think this might just be a coincidence as
> the next helper (bpf_inode_storage_delete) 
> also has a BTF argument of type inode.
It seems the next BTF_ID_LIST(bpf_inode_storage_delete_btf_ids)
is not needed because they are the same.  I think one
BTF_ID_LIST(bpf_inode_btf_ids) can be used for both helpers.

> 
> and sure enough if I call:
> 
> bpf_inode_storage_delete from my selftests program, 
> it does not load:
> 
> arg#0 type is not a struct
> Unrecognized arg#0 type PTR
> ; int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
> 0: (79) r6 = *(u64 *)(r1 +8)
> func 'bpf_lsm_inode_unlink' arg1 has btf_id 912 type STRUCT 'dentry'
> ; __u32 pid = bpf_get_current_pid_tgid() >> 32;
> 
> [...]
> 
> So, The BTF_ID_UNUSED is actually needed here. I also added a call to
> bpf_inode_storage_delete in my test to catch any issues with it.
> 
> After adding BTF_ID_UNUSED, the result is what we expect:
> 
> ./test_progs -t test_local_storage
> [   20.577223] btf_ids[0]=0
> [   20.577702] btf_ids[1]=915
> 
> Thanks for noticing this! 
> 
> - KP
> 
> > 
> >> +
> >> +const struct bpf_func_proto bpf_inode_storage_get_proto = {
> >> +  .func   = bpf_inode_storage_get,
> >> +  .gpl_only   = false,
> >> +  .ret_type   = RET_PTR_TO_MAP_VALUE_OR_NULL,
> >> +  .arg1_type  = ARG_CONST_MAP_PTR,
> >> +  .arg2_type  = ARG_PTR_TO_BTF_ID,
> >> +  .arg3_type  = ARG_PTR_TO_MAP_VALUE_OR_NULL,
> >> +  .arg4_type  = ARG_ANYTHING,
> >> +  .btf_id = bpf_inode_storage_get_btf_ids,
> >> +};
> >> +
> >> +BTF_ID_LIST(bpf_inode_storage_delete_btf_ids)
> >> +BTF_ID(struct, inode)
> >> +
> >> +const struct bpf_func_proto bpf_inode_storage_delete_proto = {
> >> +  .func   = bpf_inode_storage_delete,
> >> +  .gpl_only   = false,
> >> +  .ret_type   = RET_INTEGER,
> >> +  .arg1_type  = ARG_CONST_MAP_PTR,
> >> +  .arg2_type  = ARG_PTR_TO_BTF_ID,
> >> +  .btf_id = bpf_inode_storage_delete_btf_ids,
> >> +};


Re: [PATCH bpf-next v7 5/7] bpf: Implement bpf_local_storage for inodes

2020-07-30 Thread Martin KaFai Lau
On Thu, Jul 30, 2020 at 04:07:14PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> Similar to bpf_local_storage for sockets, add local storage for inodes.
> The life-cycle of storage is managed with the life-cycle of the inode.
> i.e. the storage is destroyed along with the owning inode.
> 
> The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the
> security blob which are now stackable and can co-exist with other LSMs.
> 

[ ... ]

> +static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
> +  void *value, u64 map_flags)
> +{
> + struct bpf_local_storage_data *sdata;
> + struct file *f;
> + int fd;
> +
> + fd = *(int *)key;
> + f = fcheck(fd);
> + if (!f)
> + return -EINVAL;
> +
> + sdata = bpf_local_storage_update(f->f_inode, map, value, map_flags);
n00b question.  inode will not be going away here (like another
task calls close(fd))?  and there is no chance that bpf_local_storage_update()
will be adding new storage after bpf_inode_storage_free() was called?

A few comments will be useful here.

> + return PTR_ERR_OR_ZERO(sdata);
> +}
> +
> +static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
> +{
> + struct bpf_local_storage_data *sdata;
> +
> + sdata = inode_storage_lookup(inode, map, false);
> + if (!sdata)
> + return -ENOENT;
> +
> + bpf_selem_unlink(SELEM(sdata));
> +
> + return 0;
> +}
> +
> +static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
> +{
> + struct file *f;
> + int fd;
> +
> + fd = *(int *)key;
> + f = fcheck(fd);
> + if (!f)
> + return -EINVAL;
> +
> + return inode_storage_delete(f->f_inode, map);
> +}
> +
> +BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, 
> inode,
> +void *, value, u64, flags)
> +{
> + struct bpf_local_storage_data *sdata;
> +
> + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
> + return (unsigned long)NULL;
> +
> + sdata = inode_storage_lookup(inode, map, true);
> + if (sdata)
> + return (unsigned long)sdata->data;
> +
> + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
> + sdata = bpf_local_storage_update(inode, map, value,
> +  BPF_NOEXIST);
The same question here

> + return IS_ERR(sdata) ? (unsigned long)NULL :
> +  (unsigned long)sdata->data;
> + }
> +
> + return (unsigned long)NULL;
> +}
> +
> +BPF_CALL_2(bpf_inode_storage_delete,
> +struct bpf_map *, map, struct inode *, inode)
> +{
> + return inode_storage_delete(inode, map);
> +}
> +
> +static int notsupp_get_next_key(struct bpf_map *map, void *key,
> + void *next_key)
> +{
> + return -ENOTSUPP;
> +}
> +
> +static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
> +{
> + struct bpf_local_storage_map *smap;
> +
> + smap = bpf_local_storage_map_alloc(attr);
> + if (IS_ERR(smap))
> + return ERR_CAST(smap);
> +
> + smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
> + return &smap->map;
> +}
> +
> +static void inode_storage_map_free(struct bpf_map *map)
> +{
> + struct bpf_local_storage_map *smap;
> +
> + smap = (struct bpf_local_storage_map *)map;
> + bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
> + bpf_local_storage_map_free(smap);
> +}
> +
> +static int sk_storage_map_btf_id;
> +const struct bpf_map_ops inode_storage_map_ops = {
> + .map_alloc_check = bpf_local_storage_map_alloc_check,
> + .map_alloc = inode_storage_map_alloc,
> + .map_free = inode_storage_map_free,
> + .map_get_next_key = notsupp_get_next_key,
> + .map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
> + .map_update_elem = bpf_fd_inode_storage_update_elem,
> + .map_delete_elem = bpf_fd_inode_storage_delete_elem,
> + .map_check_btf = bpf_local_storage_map_check_btf,
> + .map_btf_name = "bpf_local_storage_map",
> + .map_btf_id = &sk_storage_map_btf_id,
> + .map_owner_storage_ptr = inode_storage_ptr,
> +};
> +
> +BTF_ID_LIST(bpf_inode_storage_get_btf_ids)
> +BTF_ID(struct, inode)
The ARG_PTR_TO_BTF_ID is the second arg instead of the first
arg in bpf_inode_storage_get_proto.
Does it just happen to work here without needing BTF_ID_UNUSED?

> +
> +const struct bpf_func_proto bpf_inode_storage_get_proto = {
> + .func   = bpf_inode_storage_get,
> + .gpl_only   = false,
> + .ret_type   = RET_PTR_TO_MAP_VALUE_OR_NULL,
> + .arg1_type  = ARG_CONST_MAP_PTR,
> + .arg2_type  = ARG_PTR_TO_BTF_ID,
> + .arg3_type  = ARG_PTR_TO_MAP_VALUE_OR_NULL,
> + .arg4_type  = ARG_ANYTHING,
> + .btf_id = bpf_inode_storage_get_btf_ids,
> +};
> +
> +BTF_ID_LIST(bpf_inode_storage_delete_btf_ids)
> +BTF_ID(struct, inode)
> +
> +const 

Re: [RFC PATCH bpf-next] bpf: POC on local_storage charge and uncharge map_ops

2020-07-27 Thread Martin KaFai Lau
On Mon, Jul 27, 2020 at 10:26:53PM +0200, KP Singh wrote:
> Thanks for this, I was able to update the series with this patch and it works.
> One minor comment though.
> 
> I was wondering how should I send it as a part of the series. I will keep the
> original commit description + mention this thread and add your 
> Co-Developed-by:
> tag and then you can add your Signed-off-by: as well.
Sounds good to me.

Thanks for verifying the idea.  Feel free to make changes or clean up on
this RFC.

> I am not sure of the 
> canonical way here and am open to suggestions :)
> 
> - KP
> 
> On 25.07.20 03:30, Martin KaFai Lau wrote:
> > It is a direct replacement of the patch 3 in discussion [1]
> > and to test out the idea on adding
> > map_local_storage_charge, map_local_storage_uncharge,
> > and map_owner_storage_ptr.
> > 
> > It is only compiler tested to demo the idea.
> > 
> > [1]: 
> > https://urldefense.proofpoint.com/v2/url?u=https-3A__patchwork.ozlabs.org_project_netdev_patch_20200723115032.460770-2D4-2Dkpsingh-40chromium.org_=DwICaQ=5VD0RTtNlTh3ycd41b3MUw=VQnoQ7LvghIj0gVEaiQSUw=NZVmomh5sMPlIAqLmFQ_MXlMILuq1Z7TQqntbPoZ0ew=MLVevCJz2eNWswxXXF3jFYdAV2UG-xJEi0I1PkLL-fw=
> >  
> > 
> > Signed-off-by: Martin KaFai Lau 
> > ---
> >  include/linux/bpf.h|  10 ++
> >  include/net/bpf_sk_storage.h   |  51 +++
> >  include/uapi/linux/bpf.h   |   8 +-
> 
> [...]
> 
> > +
> > +static void sk_storage_uncharge(struct bpf_local_storage_map *smap,
> > +   void *owner, u32 size)
> > +{
> > +   struct sock *sk = owner;
> > +
> > +   atomic_sub(size, &sk->sk_omem_alloc);
> > +}
> > +
> > +static struct bpf_local_storage __rcu **
> > +sk_storage_ptr(struct bpf_local_storage_map *smap, void *owner)
> 
> Do we need an smap pointer here? It's not being used and is also not
> used for inode as well.
You are correct.  No, it is not needed.
I threw it in there merely because it is a map_ops.  It is unused
and can be removed.

> > +{
> > +   struct sock *sk = owner;
> > +
> > +   return &sk->sk_bpf_storage;
> > +}
> > +
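
Dropping the unused smap argument as agreed above, the callback reduces to
(sketch):

	static struct bpf_local_storage __rcu **sk_storage_ptr(void *owner)
	{
		struct sock *sk = owner;

		return &sk->sk_bpf_storage;
	}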


[RFC PATCH bpf-next] bpf: POC on local_storage charge and uncharge map_ops

2020-07-24 Thread Martin KaFai Lau
It is a direct replacement of the patch 3 in discussion [1]
and to test out the idea on adding
map_local_storage_charge, map_local_storage_uncharge,
and map_owner_storage_ptr.

It is only compiler tested to demo the idea.

[1]: 
https://patchwork.ozlabs.org/project/netdev/patch/20200723115032.460770-4-kpsi...@chromium.org/

Signed-off-by: Martin KaFai Lau 
---
 include/linux/bpf.h|  10 ++
 include/net/bpf_sk_storage.h   |  51 +++
 include/uapi/linux/bpf.h   |   8 +-
 net/core/bpf_sk_storage.c  | 250 +
 tools/include/uapi/linux/bpf.h |   8 +-
 5 files changed, 236 insertions(+), 91 deletions(-)
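
A rough sketch (not part of the diff below; the helper names here are
only illustrative) of how the generic local_storage code is expected to
go through the new ops instead of touching the sock fields directly:

        static int mem_charge(struct bpf_local_storage_map *smap,
                              void *owner, u32 size)
        {
                return smap->map.ops->map_local_storage_charge(smap, owner, size);
        }

        static struct bpf_local_storage __rcu **
        owner_storage(struct bpf_local_storage_map *smap, void *owner)
        {
                return smap->map.ops->map_owner_storage_ptr(smap, owner);
        }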

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 72221aea1c60..d4eab7ccbb51 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -33,6 +33,9 @@ struct btf;
 struct btf_type;
 struct exception_table_entry;
 struct seq_operations;
+struct bpf_local_storage;
+struct bpf_local_storage_map;
+struct bpf_local_storage_elem;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -93,6 +96,13 @@ struct bpf_map_ops {
__poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
 struct poll_table_struct *pts);
 
+   /* Functions called by bpf_local_storage maps */
+   int (*map_local_storage_charge)(struct bpf_local_storage_map *smap,
+   void *owner, u32 size);
+   void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap,
+  void *owner, u32 size);
+   struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(struct bpf_local_storage_map *smap,
+  void *owner);
/* BTF name and id of struct allocated by map_alloc */
const char * const map_btf_name;
int *map_btf_id;
diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index 950c5aaba15e..05b777950eb3 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -3,8 +3,15 @@
 #ifndef _BPF_SK_STORAGE_H
 #define _BPF_SK_STORAGE_H
 
+#include 
+#include 
+#include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
 
 struct sock;
 
 u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache);
 void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
  u16 idx);
 
+/* Helper functions for bpf_local_storage */
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
+
+struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr);
+
+struct bpf_local_storage_data *
+bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
+struct bpf_local_storage_map *smap,
+bool cacheit_lockit);
+
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap);
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+   const struct btf *btf,
+   const struct btf_type *key_type,
+   const struct btf_type *value_type);
+
+void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
+   struct bpf_local_storage_elem *selem);
+
+bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ bool uncharge_omem);
+
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem);
+
+void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+   struct bpf_local_storage_elem *selem);
+
+void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
+
+struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
+   bool charge_mem);
+
+int
+bpf_local_storage_alloc(void *owner,
+   struct bpf_local_storage_map *smap,
+   struct bpf_local_storage_elem *first_selem);
+
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_map *map, void *value,
+u64 map_flags);
+
 #ifdef CONFIG_BPF_SYSCALL
 int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
 struct bpf_sk_storage_diag *
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 54d0c886e3ba..b9d2e4792d08 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3630,9 +3630,13 @@ enum {
BPF_F_SYSCTL_BASE_NAME  = (1ULL << 0),
 };
 
-/* BPF_FUNC_sk_storage_get flags */
+/* BPF_FUNC__storage_get flags */
 enum {
-   BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0),
+   BPF_LOCAL_STORAGE_GET_F_CREATE  = (1ULL << 0),
+   /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility
+* and BPF_LOCAL_STORAGE

Re: [PATCH bpf-next v6 3/7] bpf: Generalize bpf_sk_storage

2020-07-24 Thread Martin KaFai Lau
On Thu, Jul 23, 2020 at 01:50:28PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> Refactor the functionality in bpf_sk_storage.c so that concept of
> storage linked to kernel objects can be extended to other objects like
> inode, task_struct etc.
> 
> Each new local storage will still be a separate map and provide its own
> set of helpers. This allows for future object specific extensions and
> still share a lot of the underlying implementation.
> 

[ ... ]

> @@ -386,54 +407,28 @@ static int sk_storage_alloc(struct sock *sk,
>   * Otherwise, it will become a leak (and other memory issues
>   * during map destruction).
>   */
> -static struct bpf_local_storage_data *
> -bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value,
> +struct bpf_local_storage_data *
> +bpf_local_storage_update(void *owner, struct bpf_map *map,
> +  struct bpf_local_storage *local_storage, void *value,
>u64 map_flags)
>  {
>   struct bpf_local_storage_data *old_sdata = NULL;
>   struct bpf_local_storage_elem *selem;
> - struct bpf_local_storage *local_storage;
>   struct bpf_local_storage_map *smap;
>   int err;
>  
> - /* BPF_EXIST and BPF_NOEXIST cannot be both set */
> - if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
> - /* BPF_F_LOCK can only be used in a value with spin_lock */
> - unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
> - return ERR_PTR(-EINVAL);
> -
>   smap = (struct bpf_local_storage_map *)map;
> - local_storage = rcu_dereference(sk->sk_bpf_storage);
> - if (!local_storage || hlist_empty(&local_storage->list)) {
> - /* Very first elem for this object */
> - err = check_flags(NULL, map_flags);
This check_flags() call is missing in the later sk_storage_update().

> - if (err)
> - return ERR_PTR(err);
> -
> - selem = bpf_selem_alloc(smap, sk, value, true);
> - if (!selem)
> - return ERR_PTR(-ENOMEM);
> -
> - err = sk_storage_alloc(sk, smap, selem);
> - if (err) {
> - kfree(selem);
> - atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
> - return ERR_PTR(err);
> - }
> -
> - return SDATA(selem);
> - }
>  
>   if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
>   /* Hoping to find an old_sdata to do inline update
>* such that it can avoid taking the local_storage->lock
>* and changing the lists.
>*/
> - old_sdata =
> - bpf_local_storage_lookup(local_storage, smap, false);
> + old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
>   err = check_flags(old_sdata, map_flags);
>   if (err)
>   return ERR_PTR(err);
> +
>   if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
>   copy_map_value_locked(map, old_sdata->data,
> value, false);

[ ... ]

> +static struct bpf_local_storage_data *
> +sk_storage_update(void *owner, struct bpf_map *map, void *value, u64 map_flags)
> +{
> + struct bpf_local_storage_data *old_sdata = NULL;
> + struct bpf_local_storage_elem *selem;
> + struct bpf_local_storage *local_storage;
> + struct bpf_local_storage_map *smap;
> + struct sock *sk;
> + int err;
> +
> + err = bpf_local_storage_check_update_flags(map, map_flags);
> + if (err)
> + return ERR_PTR(err);
> +
> + sk = owner;
> + local_storage = rcu_dereference(sk->sk_bpf_storage);
> + smap = (struct bpf_local_storage_map *)map;
> +
> + if (!local_storage || hlist_empty(&local_storage->list)) {

"check_flags(NULL, map_flags);" is gone in this refactoring.

This part of code is copied into the inode_storage_update()
in the latter patch which then has the same issue.
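
i.e. the first-elem path would presumably need the check restored, along
the lines of this sketch:

        if (!local_storage || hlist_empty(&local_storage->list)) {
                /* Very first elem: there is no old_sdata yet, so
                 * BPF_EXIST has to be rejected before allocating.
                 */
                err = check_flags(NULL, map_flags);
                if (err)
                        return ERR_PTR(err);
                /* ... then continue with map_selem_alloc() as in the patch ... */
        }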

> + /* Very first elem */
> + selem = map->ops->map_selem_alloc(smap, owner, value, !old_sdata);
> + if (!selem)
> + return ERR_PTR(-ENOMEM);

>  static int sk_storage_map_btf_id;
>  const struct bpf_map_ops sk_storage_map_ops = {
> - .map_alloc_check = bpf_sk_storage_map_alloc_check,
> - .map_alloc = bpf_local_storage_map_alloc,
> - .map_free = bpf_local_storage_map_free,
> + .map_alloc_check = bpf_local_storage_map_alloc_check,
> + .map_alloc = sk_storage_map_alloc,
> + .map_free = sk_storage_map_free,
>   .map_get_next_key = notsupp_get_next_key,
>   .map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
>   .map_update_elem = bpf_fd_sk_storage_update_elem,
>   .map_delete_elem = bpf_fd_sk_storage_delete_elem,
> - .map_check_btf = bpf_sk_storage_map_check_btf,
> + .map_check_btf = 

Re: [PATCH bpf-next v6 1/7] bpf: Renames to prepare for generalizing sk_storage.

2020-07-23 Thread Martin KaFai Lau
On Thu, Jul 23, 2020 at 01:50:26PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> A purely mechanical change to split the renaming from the actual
> generalization.
> 
> Flags/consts:
> 
>   SK_STORAGE_CREATE_FLAG_MASK BPF_LOCAL_STORAGE_CREATE_FLAG_MASK
>   BPF_SK_STORAGE_CACHE_SIZE   BPF_LOCAL_STORAGE_CACHE_SIZE
>   MAX_VALUE_SIZE  BPF_LOCAL_STORAGE_MAX_VALUE_SIZE
> 
> Structs:
> 
>   bucket  bpf_local_storage_map_bucket
>   bpf_sk_storage_map  bpf_local_storage_map
>   bpf_sk_storage_data bpf_local_storage_data
>   bpf_sk_storage_elem bpf_local_storage_elem
>   bpf_sk_storage  bpf_local_storage
>   selem_linked_to_sk  selem_linked_to_storage
>   selem_alloc bpf_selem_alloc
> 
> The "sk" member in bpf_local_storage is also updated to "owner"
> in preparation for changing the type to void * in a subsequent patch.
> 
> Functions:
> 
>   __selem_unlink_sk   bpf_selem_unlink_storage
>   __selem_link_sk bpf_selem_link_storage
>   selem_unlink_sk __bpf_selem_unlink_storage
>   sk_storage_update   bpf_local_storage_update
>   __sk_storage_lookup bpf_local_storage_lookup
>   bpf_sk_storage_map_free bpf_local_storage_map_free
>   bpf_sk_storage_map_allocbpf_local_storage_map_alloc
>   bpf_sk_storage_map_alloc_check  bpf_local_storage_map_alloc_check
>   bpf_sk_storage_map_check_btfbpf_local_storage_map_check_btf
Thanks for separating this mechanical name change into a separate patch.
It is much easier to follow.  This patch looks good.

A minor thought: when I look at unlink_map() and unlink_storage(),
I keep having to look back to check the locking situation.  I think
the main reason is that bpf_selem_unlink_map() is locked but
bpf_selem_unlink_storage() is unlocked now.  Maybe:

bpf_selem_unlink_map()       => bpf_selem_unlink_map_locked()
bpf_selem_link_map()         => bpf_selem_link_map_locked()
__bpf_selem_unlink_storage() => bpf_selem_unlink_storage_locked()
bpf_link_storage() means unlocked
bpf_unlink_storage() means unlocked.

I think it could be a single follow-up patch later instead of interrupting
multiple patches in this set for this minor thing.  For now, let's
continue with this and remember the default is nolock for storage.

I will continue tomorrow.


Re: [PATCH bpf-next v4 2/4] bpf: Implement bpf_local_storage for inodes

2020-07-15 Thread Martin KaFai Lau
On Thu, Jul 09, 2020 at 12:12:37PM +0200, KP Singh wrote:
> From: KP Singh 
> 
> Similar to bpf_local_storage for sockets, add local storage for inodes.
> The life-cycle of storage is managed with the life-cycle of the inode.
> i.e. the storage is destroyed along with the owning inode.
> 
> The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the
> security blob which are now stackable and can co-exist with other LSMs.
> 
> Signed-off-by: KP Singh 

[ ... ]


> +static void *bpf_inode_storage_lookup_elem(struct bpf_map *map, void *key)
> +{
> + struct bpf_local_storage_data *sdata;
> + struct inode *inode;
> + int err = -EINVAL;
> +
> + if (key) {
> + inode = *(struct inode **)(key);
The bpf_inode_storage_lookup_elem() here and the (update|delete)_elem() below
are called from the userspace syscall.  How can userspace provide this key?
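
For illustration, this is roughly what a userspace caller would have to
do with this key format (a sketch; map_fd and the value buffer size are
placeholders):

        struct inode *inode_ptr;        /* a kernel pointer userspace cannot obtain */
        char value[64];

        err = bpf_map_lookup_elem(map_fd, &inode_ptr, value);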

> + sdata = inode_storage_lookup(inode, map, true);
> + return sdata ? sdata->data : NULL;
> + }
> +
> + return ERR_PTR(err);
> +}
> +
> +static int bpf_inode_storage_update_elem(struct bpf_map *map, void *key,
> +  void *value, u64 map_flags)
> +{
> + struct bpf_local_storage_data *sdata;
> + struct inode *inode;
> + int err = -EINVAL;
> +
> + if (key) {
> + inode = *(struct inode **)(key);
> + sdata = map->ops->map_local_storage_update(inode, map, value,
> +map_flags);
> + return PTR_ERR_OR_ZERO(sdata);
> + }
> + return err;
> +}
> +
> +static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
> +{
> + struct bpf_local_storage_data *sdata;
> +
> + sdata = inode_storage_lookup(inode, map, false);
> + if (!sdata)
> + return -ENOENT;
> +
> + bpf_selem_unlink_map_elem(SELEM(sdata));
> +
> + return 0;
> +}
> +
> +static int bpf_inode_storage_delete_elem(struct bpf_map *map, void *key)
> +{
> + struct inode *inode;
> + int err = -EINVAL;
> +
> + if (key) {
> + inode = *(struct inode **)(key);
> + err = inode_storage_delete(inode, map);
> + }
> +
> + return err;
> +}
> +

[ ... ]

> +static int inode_storage_map_btf_id;
> +const struct bpf_map_ops inode_storage_map_ops = {
> + .map_alloc_check = bpf_local_storage_map_alloc_check,
> + .map_alloc = inode_storage_map_alloc,
> + .map_free = inode_storage_map_free,
> + .map_get_next_key = notsupp_get_next_key,
> + .map_lookup_elem = bpf_inode_storage_lookup_elem,
> + .map_update_elem = bpf_inode_storage_update_elem,
> + .map_delete_elem = bpf_inode_storage_delete_elem,
> + .map_check_btf = bpf_local_storage_map_check_btf,
> + .map_btf_name = "bpf_local_storage_map",
> + .map_btf_id = &inode_storage_map_btf_id,
> + .map_local_storage_alloc = inode_storage_alloc,
> + .map_selem_alloc = inode_selem_alloc,
> + .map_local_storage_update = inode_storage_update,
> + .map_local_storage_unlink = unlink_inode_storage,
> +};
> +

