When updating an existing element in lru_[percpu_,]hash maps, the current
implementation always calls prealloc_lru_pop() to get a new node before
checking if the key already exists. If the map is full, this triggers
LRU eviction and removes an existing element, even though the update
operation only needs to modify the value of an existing key in-place.

This is problematic because:
1. Users may unexpectedly lose entries when doing simple value updates
2. The eviction overhead is unnecessary for existing key updates

Fix this by checking whether the key already exists before falling back
to prealloc_lru_pop(). If the key is found, update its value using an
extra LRU node reserved per possible CPU, without triggering any
eviction.
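
For illustration only (not part of this patch), a minimal userspace
sketch of the pattern being fixed, using libbpf's bpf_map_create() and
bpf_map_update_elem(); the helper name, map name, sizes and keys are
made up, and a privileged (CAP_BPF) environment is assumed:

  /* Fill a tiny LRU hash to capacity, then do a value-only update of an
   * existing key. Before this change, the prealloc_lru_pop() performed
   * for that update could evict another entry; with it, both keys are
   * expected to survive.
   */
  #include <bpf/bpf.h>

  int check_inplace_update(void)
  {
          __u32 key;
          __u64 val = 1;
          int fd;

          fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "lru_demo",
                              sizeof(key), sizeof(val), 2, NULL);
          if (fd < 0)
                  return fd;

          /* Fill the map up to max_entries. */
          for (key = 0; key < 2; key++)
                  bpf_map_update_elem(fd, &key, &val, BPF_ANY);

          /* Update only the value of an existing key. */
          key = 0;
          val = 2;
          bpf_map_update_elem(fd, &key, &val, BPF_EXIST);

          /* Key 1 should still be present; returns 0 on success. */
          key = 1;
          return bpf_map_lookup_elem(fd, &key, &val);
  }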

Fixes: 29ba732acbee ("bpf: Add BPF_MAP_TYPE_LRU_HASH")
Fixes: 8f8449384ec3 ("bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH")
Signed-off-by: Leon Hwang <[email protected]>
---
 kernel/bpf/bpf_lru_list.c | 164 +++++++++++++++++++++++++++++++++++---
 kernel/bpf/bpf_lru_list.h |   5 +-
 kernel/bpf/hashtab.c      |  85 ++++++++++++++++++--
 3 files changed, 239 insertions(+), 15 deletions(-)

diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 563707af8035..142b0f10b011 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -124,6 +124,41 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l,
        list_move(&node->list, &l->lists[tgt_type]);
 }
 
+static struct bpf_lru_node *__bpf_lru_node_move_from_extra(struct bpf_lru_list *l,
+                                                          enum bpf_lru_list_type tgt_type)
+{
+       struct bpf_lru_node *node;
+
+       node = list_first_entry_or_null(&l->extra, struct bpf_lru_node, list);
+       if (!node)
+               return NULL;
+
+       if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+               return NULL;
+
+       bpf_lru_list_count_inc(l, tgt_type);
+       bpf_lru_node_reset_state(node, tgt_type);
+       list_move(&node->list, &l->lists[tgt_type]);
+       return node;
+}
+
+static bool __bpf_lru_node_move_to_extra(struct bpf_lru_list *l,
+                                        struct bpf_lru_node *node)
+{
+       if (!list_empty(&l->extra))
+               return false;
+
+       if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
+               return false;
+
+       bpf_lru_move_next_inactive_rotation(l, node);
+
+       bpf_lru_list_count_dec(l, node->type);
+       bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE);
+       list_move(&node->list, &l->extra);
+       return true;
+}
+
 static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
 {
        return l->counts[BPF_LRU_LIST_T_INACTIVE] <
@@ -305,6 +340,69 @@ static void __local_list_flush(struct bpf_lru_list *l,
        }
 }
 
+static struct bpf_lru_node *bpf_percpu_lru_pop_extra(struct bpf_lru *lru)
+{
+       int cpu = raw_smp_processor_id();
+       struct bpf_lru_node *node;
+       struct bpf_lru_list *l;
+       unsigned long flags;
+
+       l = per_cpu_ptr(lru->percpu_lru, cpu);
+
+       raw_spin_lock_irqsave(&l->lock, flags);
+
+       node = __bpf_lru_node_move_from_extra(l, BPF_LRU_LIST_T_ACTIVE);
+
+       raw_spin_unlock_irqrestore(&l->lock, flags);
+
+       return node;
+}
+
+static struct bpf_lru_node *bpf_lru_locallist_extra_pop(struct bpf_lru_locallist *l)
+{
+       struct bpf_lru_node *node;
+
+       node = list_first_entry_or_null(&l->extra, struct bpf_lru_node, list);
+       if (node)
+               list_del(&node->list);
+
+       return node;
+}
+
+static void __local_list_add_pending(struct bpf_lru *lru,
+                                    struct bpf_lru_locallist *loc_l,
+                                    int cpu,
+                                    struct bpf_lru_node *node);
+
+static struct bpf_lru_node *bpf_common_lru_pop_extra(struct bpf_lru *lru)
+{
+       struct bpf_common_lru *clru = &lru->common_lru;
+       int cpu = raw_smp_processor_id();
+       struct bpf_lru_locallist *loc_l;
+       struct bpf_lru_node *node;
+       unsigned long flags;
+
+       loc_l = per_cpu_ptr(clru->local_list, cpu);
+
+       raw_spin_lock_irqsave(&loc_l->lock, flags);
+
+       node = bpf_lru_locallist_extra_pop(loc_l);
+       if (node)
+               __local_list_add_pending(lru, loc_l, cpu, node);
+
+       raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+
+       return node;
+}
+
+struct bpf_lru_node *bpf_lru_pop_extra(struct bpf_lru *lru)
+{
+       if (lru->percpu)
+               return bpf_percpu_lru_pop_extra(lru);
+       else
+               return bpf_common_lru_pop_extra(lru);
+}
+
 static void bpf_lru_list_push_free(struct bpf_lru_list *l,
                                   struct bpf_lru_node *node)
 {
@@ -496,6 +594,16 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru)
                return bpf_common_lru_pop_free(lru);
 }
 
+static bool bpf_lru_locallist_extra_push(struct bpf_lru_locallist *loc_l, struct bpf_lru_node *node)
+{
+       if (!list_empty(&loc_l->extra))
+               return false;
+
+       bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE);
+       list_move(&node->list, &loc_l->extra);
+       return true;
+}
+
 static void bpf_common_lru_push_free(struct bpf_lru *lru,
                                     struct bpf_lru_node *node)
 {
@@ -518,8 +626,10 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
                        goto check_lru_list;
                }
 
-               bpf_lru_node_reset_state(node, BPF_LRU_LOCAL_LIST_T_FREE);
-               list_move(&node->list, local_free_list(loc_l));
+               if (!bpf_lru_locallist_extra_push(loc_l, node)) {
+                       bpf_lru_node_reset_state(node, BPF_LRU_LOCAL_LIST_T_FREE);
+                       list_move(&node->list, local_free_list(loc_l));
+               }
 
                raw_spin_unlock_irqrestore(&loc_l->lock, flags);
                return;
@@ -539,7 +649,8 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
 
        raw_spin_lock_irqsave(&l->lock, flags);
 
-       __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
+       if (!__bpf_lru_node_move_to_extra(l, node))
+               __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
 
        raw_spin_unlock_irqrestore(&l->lock, flags);
 }
@@ -554,9 +665,11 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
 
 static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
                                    u32 node_offset, u32 elem_size,
-                                   u32 nr_elems)
+                                   u32 nr_elems, u32 nr_extra_elems)
 {
-       struct bpf_lru_list *l = &lru->common_lru.lru_list;
+       struct bpf_common_lru *clru = &lru->common_lru;
+       struct bpf_lru_list *l = &clru->lru_list;
+       int cpu;
        u32 i;
 
        for (i = 0; i < nr_elems; i++) {
@@ -570,11 +683,26 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
 
        lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2,
                                 1, LOCAL_FREE_TARGET);
+
+       if (WARN_ON_ONCE(nr_extra_elems != num_possible_cpus()))
+               return;
+
+       for_each_possible_cpu(cpu) {
+               struct bpf_lru_locallist *loc_l;
+               struct bpf_lru_node *node;
+
+               loc_l = per_cpu_ptr(clru->local_list, cpu);
+               node = (struct bpf_lru_node *)(buf + node_offset);
+               node->cpu = cpu;
+               bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE);
+               list_add(&node->list, &loc_l->extra);
+               buf += elem_size;
+       }
 }
 
 static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
                                    u32 node_offset, u32 elem_size,
-                                   u32 nr_elems)
+                                   u32 nr_elems, u32 nr_extra_elems)
 {
        u32 i, pcpu_entries;
        int cpu;
@@ -600,17 +728,31 @@ static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
                if (i % pcpu_entries)
                        goto again;
        }
+
+       if (WARN_ON_ONCE(nr_extra_elems != num_possible_cpus()))
+               return;
+
+       for_each_possible_cpu(cpu) {
+               struct bpf_lru_node *node;
+
+               l = per_cpu_ptr(lru->percpu_lru, cpu);
+               node = (struct bpf_lru_node *)(buf + node_offset);
+               node->cpu = cpu;
+               bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE);
+               list_add(&node->list, &l->extra);
+               buf += elem_size;
+       }
 }
 
 void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
-                     u32 elem_size, u32 nr_elems)
+                     u32 elem_size, u32 nr_elems, u32 nr_extra_elems)
 {
        if (lru->percpu)
                bpf_percpu_lru_populate(lru, buf, node_offset, elem_size,
-                                       nr_elems);
+                                       nr_elems, nr_extra_elems);
        else
                bpf_common_lru_populate(lru, buf, node_offset, elem_size,
-                                       nr_elems);
+                                       nr_elems, nr_extra_elems);
 }
 
 static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
@@ -620,6 +762,8 @@ static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
        for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
                INIT_LIST_HEAD(&loc_l->lists[i]);
 
+       INIT_LIST_HEAD(&loc_l->extra);
+
        loc_l->next_steal = cpu;
 
        raw_spin_lock_init(&loc_l->lock);
@@ -637,6 +781,8 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
 
        l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
 
+       INIT_LIST_HEAD(&l->extra);
+
        raw_spin_lock_init(&l->lock);
 }
 
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 29e8300e0fd1..446779341b34 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -33,12 +33,14 @@ struct bpf_lru_list {
        unsigned int counts[NR_BPF_LRU_LIST_COUNT];
        /* The next inactive list rotation starts from here */
        struct list_head *next_inactive_rotation;
+       struct list_head extra; /* for percpu lru */
 
        raw_spinlock_t lock ____cacheline_aligned_in_smp;
 };
 
 struct bpf_lru_locallist {
        struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
+       struct list_head extra; /* for common lru */
        u16 next_steal;
        raw_spinlock_t lock;
 };
@@ -71,9 +73,10 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
 int bpf_lru_init(struct bpf_lru *lru, bool percpu,
                 del_from_htab_func del_from_htab, void *delete_arg);
 void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
-                     u32 elem_size, u32 nr_elems);
+                     u32 elem_size, u32 nr_elems, u32 nr_extra_elems);
 void bpf_lru_destroy(struct bpf_lru *lru);
 struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru);
 void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
+struct bpf_lru_node *bpf_lru_pop_extra(struct bpf_lru *lru);
 
 #endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d029690246f8..8665eb6b8a7d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -207,12 +207,12 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
 }
 
 /* Both percpu and fd htab support in-place update, so no need for
- * extra elem. LRU itself can remove the least used element, so
- * there is no need for an extra elem during map_update.
+ * extra elem. LRU requires extra elems to avoid unintended eviction when
+ * updating existing elems.
  */
 static bool htab_has_extra_elems(struct bpf_htab *htab)
 {
-       return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
+       return htab_is_lru(htab) || (!htab_is_percpu(htab) && !is_fd_htab(htab));
 }
 
 static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
@@ -313,6 +313,7 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 static int prealloc_init(struct bpf_htab *htab)
 {
        u32 num_entries = htab->map.max_entries;
+       u32 lru_num_entries = num_entries;
        int err = -ENOMEM, i;
 
        if (htab_has_extra_elems(htab))
@@ -354,7 +355,8 @@ static int prealloc_init(struct bpf_htab *htab)
        if (htab_is_lru(htab))
                bpf_lru_populate(&htab->lru, htab->elems,
                                 offsetof(struct htab_elem, lru_node),
-                                htab->elem_size, num_entries);
+                                htab->elem_size, lru_num_entries,
+                                num_entries - lru_num_entries);
        else
                pcpu_freelist_populate(&htab->freelist,
                                       htab->elems + offsetof(struct htab_elem, fnode),
@@ -557,7 +559,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
                if (err)
                        goto free_map_locked;
 
-               if (htab_has_extra_elems(htab)) {
+               if (htab_has_extra_elems(htab) && !htab_is_lru(htab)) {
                        err = alloc_extra_elems(htab);
                        if (err)
                                goto free_prealloc;
@@ -1182,6 +1184,69 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
        bpf_lru_push_free(&htab->lru, &elem->lru_node);
 }
 
+static int htab_lru_map_update_elem_in_place(struct bpf_htab *htab, void *key, void *value,
+                                            u64 map_flags, struct bucket *b,
+                                            struct hlist_nulls_head *head, u32 hash,
+                                            bool percpu, bool onallcpus)
+{
+       struct htab_elem *l_new, *l_old, *l_free;
+       struct bpf_map *map = &htab->map;
+       u32 key_size = map->key_size;
+       struct bpf_lru_node *node;
+       unsigned long flags;
+       void *l_val;
+       int ret;
+
+       node = bpf_lru_pop_extra(&htab->lru);
+       if (!node)
+               return -ENOENT;
+
+       l_new = container_of(node, struct htab_elem, lru_node);
+       l_new->hash = hash;
+       memcpy(l_new->key, key, key_size);
+       if (!percpu) {
+               l_val = htab_elem_value(l_new, map->key_size);
+               copy_map_value(map, l_val, value);
+               bpf_obj_free_fields(map->record, l_val);
+       }
+
+       ret = htab_lock_bucket(b, &flags);
+       if (ret)
+               goto err_lock_bucket;
+
+       l_old = lookup_elem_raw(head, hash, key, key_size);
+
+       ret = check_flags(htab, l_old, map_flags);
+       if (ret)
+               goto err;
+
+       if (l_old) {
+               bpf_lru_node_set_ref(&l_new->lru_node);
+               if (percpu) {
+                       /* per-cpu hash map can update value in-place.
+                        * Keep the same logic as in __htab_lru_percpu_map_update_elem().
+                        */
+                       pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+                                       value, onallcpus);
+                       l_free = l_new;
+               } else {
+                       hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+                       hlist_nulls_del_rcu(&l_old->hash_node);
+                       l_free = l_old;
+               }
+       } else {
+               ret = -ENOENT;
+       }
+
+err:
+       htab_unlock_bucket(b, flags);
+
+err_lock_bucket:
+       bpf_lru_push_free(&htab->lru, ret ? node : &l_free->lru_node);
+
+       return ret;
+}
+
 static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
                                     u64 map_flags)
 {
@@ -1206,6 +1271,11 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
        b = __select_bucket(htab, hash);
        head = &b->head;
 
+       ret = htab_lru_map_update_elem_in_place(htab, key, value, map_flags, b, head, hash, false,
+                                               false);
+       if (!ret)
+               return 0;
+
        /* For LRU, we need to alloc before taking bucket's
         * spinlock because getting free nodes from LRU may need
         * to remove older elements from htab and this removal
@@ -1336,6 +1406,11 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
        b = __select_bucket(htab, hash);
        head = &b->head;
 
+       ret = htab_lru_map_update_elem_in_place(htab, key, value, map_flags, b, head, hash, true,
+                                               onallcpus);
+       if (!ret)
+               return 0;
+
        /* For LRU, we need to alloc before taking bucket's
         * spinlock because LRU's elem alloc may need
         * to remove older elem from htab and this removal
-- 
2.52.0

