Re: [PATCH net] bpf: fix hashmap extra_elems logic

2017-03-22 Thread David Miller
From: Alexei Starovoitov 
Date: Tue, 21 Mar 2017 19:05:04 -0700

> In both kmalloc and prealloc mode the bpf_map_update_elem() is using
> per-cpu extra_elems to do atomic update when the map is full.
> There are two issues with it. The logic can be misused, since it allows
> max_entries+num_cpus elements to be present in the map. And alloc_extra_elems()
> at map creation time can fail percpu alloc for large map values with a warn:
> WARNING: CPU: 3 PID: 2752 at ../mm/percpu.c:892 pcpu_alloc+0x119/0xa60
> illegal size (32824) or align (8) for percpu allocation
> 
> The fixes for both of these issues are different for kmalloc and prealloc modes.
> For prealloc mode allocate extra num_possible_cpus elements and store
> their pointers into extra_elems array instead of actual elements.
> Hence we can use these hidden(spare) elements not only when the map is full
> but during bpf_map_update_elem() that replaces existing element too.
> That also improves performance, since pcpu_freelist_pop/push is avoided.
> Unfortunately this approach cannot be used for kmalloc mode which needs
> to kfree elements after rcu grace period. Therefore switch it back to normal
> kmalloc even when full and old element exists like it was prior to
> commit 6c9059817432 ("bpf: pre-allocate hash map elements").
> 
> Add tests to check for over max_entries and large map values.
> 
> Reported-by: Dave Jones 
> Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
> Signed-off-by: Alexei Starovoitov 
> Acked-by: Daniel Borkmann 
> Acked-by: Martin KaFai Lau 

Applied and queued up for -stable, thanks.


[PATCH net] bpf: fix hashmap extra_elems logic

2017-03-21 Thread Alexei Starovoitov
In both kmalloc and prealloc mode the bpf_map_update_elem() is using
per-cpu extra_elems to do atomic update when the map is full.
There are two issues with it. The logic can be misused, since it allows
max_entries+num_cpus elements to be present in the map. And alloc_extra_elems()
at map creation time can fail percpu alloc for large map values with a warn:
WARNING: CPU: 3 PID: 2752 at ../mm/percpu.c:892 pcpu_alloc+0x119/0xa60
illegal size (32824) or align (8) for percpu allocation

The fixes for both of these issues are different for kmalloc and prealloc modes.
For prealloc mode allocate extra num_possible_cpus elements and store
their pointers into extra_elems array instead of actual elements.
Hence we can use these hidden(spare) elements not only when the map is full
but during bpf_map_update_elem() that replaces existing element too.
That also improves performance, since pcpu_freelist_pop/push is avoided.
Unfortunately this approach cannot be used for kmalloc mode which needs
to kfree elements after rcu grace period. Therefore switch it back to normal
kmalloc even when full and old element exists like it was prior to
commit 6c9059817432 ("bpf: pre-allocate hash map elements").
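
In code terms, the prealloc-mode swap in the update path amounts to
roughly the following (a simplified sketch of the logic described
above, with error handling abbreviated; l_new/pl_new/old_elem follow
the patch's naming):

	if (old_elem) {
		/* replacing an existing element: take this cpu's spare
		 * and park the replaced element as the new spare, so no
		 * pcpu_freelist_pop/push is needed
		 */
		pl_new = this_cpu_ptr(htab->extra_elems);
		l_new = *pl_new;
		*pl_new = old_elem;
	} else {
		struct pcpu_freelist_node *l;

		l = pcpu_freelist_pop(&htab->freelist);
		if (!l)
			return ERR_PTR(-E2BIG);
		l_new = container_of(l, struct htab_elem, fnode);
	}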

Add tests to check for over max_entries and large map values.
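
The selftest side is along these lines (a paraphrased sketch rather
than the exact hunk; map_size, keys[] and the helper signatures from
tools/testing/selftests/bpf are assumptions):

	/* map is full at max_entries; inserting one more key must
	 * fail with E2BIG
	 */
	for (i = 0; i < map_size; i++)
		assert(bpf_map_update_elem(fd, &keys[i], &value,
					   BPF_ANY) == 0);
	key = map_size;	/* key not present in the full map */
	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == -1 &&
	       errno == E2BIG);

	/* a hash map with a large value_size must be creatable:
	 * extra_elems now holds pointers, so there is no oversized
	 * percpu allocation at map creation time
	 */
	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), 32768,
			    1, map_flags);
	assert(fd >= 0);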

Reported-by: Dave Jones 
Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 kernel/bpf/hashtab.c                    | 144 ++++++++++++++++------------
 tools/testing/selftests/bpf/test_maps.c |  29 ++++++-
 2 files changed, 97 insertions(+), 76 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index afe5bab376c9..361a69dfe543 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -30,18 +30,12 @@ struct bpf_htab {
struct pcpu_freelist freelist;
struct bpf_lru lru;
};
-   void __percpu *extra_elems;
+   struct htab_elem *__percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets;  /* number of hash buckets */
u32 elem_size;  /* size of each element in bytes */
 };
 
-enum extra_elem_state {
-   HTAB_NOT_AN_EXTRA_ELEM = 0,
-   HTAB_EXTRA_ELEM_FREE,
-   HTAB_EXTRA_ELEM_USED
-};
-
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
union {
@@ -56,7 +50,6 @@ struct htab_elem {
};
union {
struct rcu_head rcu;
-   enum extra_elem_state state;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -77,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 }
 
+static bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+   return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 void __percpu *pptr)
 {
@@ -128,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 
 static int prealloc_init(struct bpf_htab *htab)
 {
+   u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
 
-   htab->elems = bpf_map_area_alloc(htab->elem_size *
-                                    htab->map.max_entries);
+   if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+           num_entries += num_possible_cpus();
+
+   htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
if (!htab->elems)
return -ENOMEM;
 
if (!htab_is_percpu(htab))
goto skip_percpu_elems;
 
-   for (i = 0; i < htab->map.max_entries; i++) {
+   for (i = 0; i < num_entries; i++) {
u32 size = round_up(htab->map.value_size, 8);
void __percpu *pptr;
 
@@ -166,11 +167,11 @@ static int prealloc_init(struct bpf_htab *htab)
if (htab_is_lru(htab))
        bpf_lru_populate(&htab->lru, htab->elems,
                         offsetof(struct htab_elem, lru_node),
-                        htab->elem_size, htab->map.max_entries);
+                        htab->elem_size, num_entries);
        else
                pcpu_freelist_populate(&htab->freelist,
                                       htab->elems + offsetof(struct htab_elem, fnode),
-                                      htab->elem_size, htab->map.max_entries);
+                                      htab->elem_size, num_entries);
 
return 0;
 
@@ -191,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab)
 
 static int alloc_extra_elems(struct bpf_htab *htab)
 {
-   void __percpu *pptr;
+   struct htab_elem *__percpu *pptr, *l_new;
+   struct pcpu_freelist_node *l;
int cpu;
 
-   pptr = __alloc_percpu_gfp(htab->elem_size, 8,
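
(The archive truncates the hunk here. From the description above, the
remainder of the new alloc_extra_elems() plausibly continues along
these lines -- a reconstruction for readability, not verbatim from
this mail:)

	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
				  GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop cannot fail: prealloc_init() set aside
		 * num_possible_cpus() extra elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;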