slab: extend deferred free mechanism to handle rcu sheaves

Harry Yoo (Oracle) Mon, 15 Jun 2026 04:09:58 -0700

__kfree_rcu_sheaf() cannot invoke call_rcu() when spinning is not
allowed and IRQs are disabled. To relax the limitation, extend the
deferred free fallback so that a full rcu sheaf can be submitted to
call_rcu() via the existing IRQ work.


Since the deferred mechanism does more than deferred free of objects,
rename the struct to deferred_percpu_work and adjust names accordingly.

When a sheaf is queued on an IRQ work, it is detached from
pcs->rcu_free but call_rcu() is not invoked until the irq_work runs.
To keep the kvfree_rcu barrier's promise, call irq_work_sync() on each
CPU before calling rcu_barrier().

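While at it, remove the TODO item in kvfree_rcu_barrier_on_cache() about
introducing a version of __kvfree_rcu_barrier() that works on a specific
slab cache, as apparently there is no simple and effective way to
achieve that.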

Suggested-by: Alexei Starovoitov <[email protected]>
Signed-off-by: Harry Yoo (Oracle) <[email protected]>
---
 mm/slab.h        |  2 +-
 mm/slab_common.c |  7 ++---
 mm/slub.c        | 79 ++++++++++++++++++++++++++++++++++----------------------
 3 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index b1bd33a16544..961581e35ec8 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -744,7 +744,7 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
 void __check_heap_object(const void *ptr, unsigned long n,
                         const struct slab *slab, bool to_user);
 
-void defer_free_barrier(void);
+void deferred_work_barrier(void);
 
 static inline bool slub_debug_orig_size(struct kmem_cache *s)
 {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index bc1a8ec938d9..55546b8385ff 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -551,7 +551,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
        }
 
        /* Wait for deferred work from kmalloc/kfree_nolock() */
-       defer_free_barrier();
+       deferred_work_barrier();
 
        cpus_read_lock();
        mutex_lock(&slab_mutex);
@@ -2113,13 +2113,10 @@ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
                cpus_read_lock();
                flush_rcu_sheaves_on_cache(s);
                cpus_read_unlock();
+               deferred_work_barrier();
                rcu_barrier();
        }
 
-       /*
-        * TODO: Introduce a version of __kvfree_rcu_barrier() that works
-        * on a specific slab cache.
-        */
        __kvfree_rcu_barrier();
 }
 EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
diff --git a/mm/slub.c b/mm/slub.c
index 6a3552b70683..ba593c1c53d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -418,6 +418,8 @@ struct slab_sheaf {
        union {
                struct rcu_head rcu_head;
                struct list_head barn_list;
+               /* only used to defer call_rcu() in unknown context */
+               struct llist_node llnode;
                /* only used for prefilled sheafs */
                struct {
                        unsigned int capacity;
@@ -4071,6 +4073,20 @@ static void flush_all(struct kmem_cache *s)
        cpus_read_unlock();
 }
 
+struct deferred_percpu_work {
+       struct llist_head objects;
+       struct llist_head rcu_sheaves;
+       struct irq_work work;
+};
+
+static void deferred_percpu_work_fn(struct irq_work *work);
+
+static DEFINE_PER_CPU(struct deferred_percpu_work, deferred_percpu_work) = {
+       .objects = LLIST_HEAD_INIT(objects),
+       .rcu_sheaves = LLIST_HEAD_INIT(rcu_sheaves),
+       .work = IRQ_WORK_INIT(deferred_percpu_work_fn),
+};
+
 static void flush_rcu_sheaf(struct work_struct *w)
 {
        struct slub_percpu_sheaves *pcs;
@@ -4142,6 +4158,7 @@ void flush_all_rcu_sheaves(void)
        mutex_unlock(&slab_mutex);
        cpus_read_unlock();
 
+       deferred_work_barrier();
        rcu_barrier();
 }
 
@@ -6158,12 +6175,6 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj, bool allow_spin)
        if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
                rcu_sheaf = NULL;
        } else {
-               /* call_rcu() disables IRQs to protect percpu data structures */
-               if (unlikely(!allow_spin && irqs_disabled())) {
-                       rcu_sheaf->size--;
-                       local_unlock(&s->cpu_sheaves->lock);
-                       goto fail;
-               }
                pcs->rcu_free = NULL;
                rcu_sheaf->node = numa_node_id();
        }
@@ -6172,8 +6183,18 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj, bool allow_spin)
         * we flush before local_unlock to make sure a racing
         * flush_all_rcu_sheaves() doesn't miss this sheaf
         */
-       if (rcu_sheaf)
-               call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+       if (rcu_sheaf) {
+               /* call_rcu() disables IRQs to protect percpu data structures */
+               if (unlikely(!allow_spin && irqs_disabled())) {
+                       struct deferred_percpu_work *dpw;
+
+                       dpw = this_cpu_ptr(&deferred_percpu_work);
+                       if (llist_add(&rcu_sheaf->llnode, &dpw->rcu_sheaves))
+                               irq_work_queue(&dpw->work);
+               } else {
+                       call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+               }
+       }
 
        local_unlock(&s->cpu_sheaves->lock);
 
@@ -6360,31 +6381,20 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
        }
 }
 
-struct defer_free {
-       struct llist_head objects;
-       struct irq_work work;
-};
-
-static void free_deferred_objects(struct irq_work *work);
-
-static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
-       .objects = LLIST_HEAD_INIT(objects),
-       .work = IRQ_WORK_INIT(free_deferred_objects),
-};
-
 /*
  * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe
  * to take sleeping spin_locks from __slab_free().
  * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
  */
-static void free_deferred_objects(struct irq_work *work)
+static void deferred_percpu_work_fn(struct irq_work *work)
 {
-       struct defer_free *df = container_of(work, struct defer_free, work);
-       struct llist_head *objs = &df->objects;
+       struct deferred_percpu_work *dpw;
+       struct llist_head *objs, *rcu_sheaves;
        struct llist_node *llnode, *pos, *t;
 
-       if (llist_empty(objs))
-               return;
+       dpw = container_of(work, struct deferred_percpu_work, work);
+       rcu_sheaves = &dpw->rcu_sheaves;
+       objs = &dpw->objects;
 
        llnode = llist_del_all(objs);
        llist_for_each_safe(pos, t, llnode) {
@@ -6408,27 +6418,34 @@ static void free_deferred_objects(struct irq_work *work)
                __slab_free(s, slab, x, x, 1, _THIS_IP_);
                stat(s, FREE_SLOWPATH);
        }
+
+       llnode = llist_del_all(rcu_sheaves);
+       llist_for_each_safe(pos, t, llnode) {
+               struct slab_sheaf *rcu_sheaf = llist_entry(pos, struct slab_sheaf, llnode);
+
+               call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+       }
 }
 
 static void defer_free(struct kmem_cache *s, void *head)
 {
-       struct defer_free *df;
+       struct deferred_percpu_work *dpw;
 
        guard(preempt)();
 
        head = kasan_reset_tag(head);
 
-       df = this_cpu_ptr(&defer_free_objects);
-       if (llist_add(head + s->offset, &df->objects))
-               irq_work_queue(&df->work);
+       dpw = this_cpu_ptr(&deferred_percpu_work);
+       if (llist_add(head + s->offset, &dpw->objects))
+               irq_work_queue(&dpw->work);
 }
 
-void defer_free_barrier(void)
+void deferred_work_barrier(void)
 {
        int cpu;
 
        for_each_possible_cpu(cpu)
-               irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
+               irq_work_sync(&per_cpu_ptr(&deferred_percpu_work, cpu)->work);
 }
 
 static __fastpath_inline

-- 
2.53.0

[PATCH for-next v3 5/9] mm/slab: extend deferred free mechanism to handle rcu sheaves

Reply via email to