slab: introduce kfree_rcu_nolock()

Joel Fernandes Mon, 16 Feb 2026 13:08:15 -0800

Hi Harry,

On Fri, Feb 06, 2026 at 06:34:09PM +0900, Harry Yoo wrote:
> Currently, kfree_rcu() cannot be called in an NMI context.
> In such a context, even calling call_rcu() is not legal,
> forcing users to implement deferred freeing.
> 
> Make users' lives easier by introducing kfree_rcu_nolock() variant.
> Unlike kfree_rcu(), kfree_rcu_nolock() only supports a 2-argument
> variant, because, in the worst case where memory allocation fails,
> the caller cannot synchronously wait for the grace period to finish.
> 
> Similar to kfree_nolock() implementation, try to acquire kfree_rcu_cpu
> spinlock, and if that fails, insert the object to per-cpu lockless list
> and delay freeing using irq_work that calls kvfree_call_rcu() later.
> In case kmemleak or debugobjects is enabled, always defer freeing as
> those debug features don't support NMI contexts.
> 
> When trylock succeeds, avoid consuming bnode and run_page_cache_worker()
> altogether. Instead, insert objects into struct kfree_rcu_cpu.head
> without consuming additional memory.
> 
> For now, the sheaves layer is bypassed if spinning is not allowed.
> 
> Scheduling delayed monitor work in an NMI context is tricky; use
> irq_work to schedule, but use lazy irq_work to avoid raising self-IPIs.
> That means scheduling delayed monitor work can be delayed up to the
> length of a time slice.
> 
> Without CONFIG_KVFREE_RCU_BATCHED, all frees in the !allow_spin case are
> delayed using irq_work.
> 
> Suggested-by: Alexei Starovoitov <[email protected]>
> Signed-off-by: Harry Yoo <[email protected]>
> ---
>  include/linux/rcupdate.h |  23 ++++---
>  mm/slab_common.c         | 140 +++++++++++++++++++++++++++++++++------
>  2 files changed, 133 insertions(+), 30 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index db5053a7b0cb..18bb7378b23d 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -1092,8 +1092,9 @@ static inline void rcu_read_unlock_migrate(void)
>   * The BUILD_BUG_ON check must not involve any function calls, hence the
>   * checks are done in macros here.
>   */
> -#define kfree_rcu(ptr, rf) kvfree_rcu_arg_2(ptr, rf)
> -#define kvfree_rcu(ptr, rf) kvfree_rcu_arg_2(ptr, rf)
> +#define kfree_rcu(ptr, rf) kvfree_rcu_arg_2(ptr, rf, true)
> +#define kfree_rcu_nolock(ptr, rf) kvfree_rcu_arg_2(ptr, rf, false)
> +#define kvfree_rcu(ptr, rf) kvfree_rcu_arg_2(ptr, rf, true)
>  
>  /**
>   * kfree_rcu_mightsleep() - kfree an object after a grace period.
> @@ -1117,35 +1118,35 @@ static inline void rcu_read_unlock_migrate(void)
>  
>  
>  #ifdef CONFIG_KVFREE_RCU_BATCHED
> -void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr);
> -#define kvfree_call_rcu(head, ptr) \
> +void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr, bool allow_spin);
> +#define kvfree_call_rcu(head, ptr, spin) \
>       _Generic((head), \
>               struct rcu_head *: kvfree_call_rcu_ptr,         \
>               struct rcu_ptr *: kvfree_call_rcu_ptr,          \
>               void *: kvfree_call_rcu_ptr                     \
> -     )((struct rcu_ptr *)(head), (ptr))
> +     )((struct rcu_ptr *)(head), (ptr), spin)
>  #else
> -void kvfree_call_rcu_head(struct rcu_head *head, void *ptr);
> +void kvfree_call_rcu_head(struct rcu_head *head, void *ptr, bool allow_spin);
>  static_assert(sizeof(struct rcu_head) == sizeof(struct rcu_ptr));
> -#define kvfree_call_rcu(head, ptr) \
> +#define kvfree_call_rcu(head, ptr, spin) \
>       _Generic((head), \
>               struct rcu_head *: kvfree_call_rcu_head,        \
>               struct rcu_ptr *: kvfree_call_rcu_head,         \
>               void *: kvfree_call_rcu_head                    \
> -     )((struct rcu_head *)(head), (ptr))
> +     )((struct rcu_head *)(head), (ptr), spin)
>  #endif
>  
>  /*
>   * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
>   * comment of kfree_rcu() for details.
>   */
> -#define kvfree_rcu_arg_2(ptr, rf)                                    \
> +#define kvfree_rcu_arg_2(ptr, rf, spin)                                      
> \
>  do {                                                                 \
>       typeof (ptr) ___p = (ptr);                                      \
>                                                                       \
>       if (___p) {                                                     \
>               BUILD_BUG_ON(offsetof(typeof(*(ptr)), rf) >= 4096);     \
> -             kvfree_call_rcu(&((___p)->rf), (void *) (___p));        \
> +             kvfree_call_rcu(&((___p)->rf), (void *) (___p), spin);  \
>       }                                                               \
>  } while (0)
>  
> @@ -1154,7 +1155,7 @@ do {                                                    
>         \
>       typeof(ptr) ___p = (ptr);                               \
>                                                               \
>       if (___p)                                               \
> -             kvfree_call_rcu(NULL, (void *) (___p));         \
> +             kvfree_call_rcu(NULL, (void *) (___p), true);   \
>  } while (0)
>  
>  /*
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index d232b99a4b52..9d7801e5cb73 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1311,6 +1311,12 @@ struct kfree_rcu_cpu_work {
>   * the interactions with the slab allocators.
>   */
>  struct kfree_rcu_cpu {
> +     // Objects queued on a lockless linked list, not protected by the lock.
> +     // This allows freeing objects in NMI context, where trylock may fail.
> +     struct llist_head llist_head;
> +     struct irq_work irq_work;
> +     struct irq_work sched_monitor_irq_work;


It would be great if irq_work_queue() could support a lazy flag, or a new
irq_work_queue_lazy() which then just skips the irq_work_raise() for the lazy
case. Then we don't need multiple struct irq_work doing the same thing. +PeterZ

[...]
> @@ -1979,9 +2059,15 @@ void kvfree_call_rcu_ptr(struct rcu_ptr *head, void 
> *ptr)
>       }
>  
>       kasan_record_aux_stack(ptr);
> -     success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
> +
> +     krcp = krc_this_cpu_lock(&flags, allow_spin);
> +     if (!krcp)
> +             goto defer_free;
> +
> +     success = add_ptr_to_bulk_krc_lock(krcp, &flags, ptr, !head, 
> allow_spin);
>       if (!success) {
> -             run_page_cache_worker(krcp);
> +             if (allow_spin)
> +                     run_page_cache_worker(krcp);
>  
>               if (head == NULL)
>                       // Inline if kvfree_rcu(one_arg) call.
> @@ -2005,8 +2091,12 @@ void kvfree_call_rcu_ptr(struct rcu_ptr *head, void 
> *ptr)
>       kmemleak_ignore(ptr);
>  
>       // Set timer to drain after KFREE_DRAIN_JIFFIES.
> -     if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
> -             __schedule_delayed_monitor_work(krcp);
> +     if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) {
> +             if (allow_spin)
> +                     __schedule_delayed_monitor_work(krcp);
> +             else
> +                     irq_work_queue(&krcp->sched_monitor_irq_work);

Here this irq_work will be queued even if delayed_work_pending? That might be
additional irq_work overhead (which was not needed) when the delayed monitor
was already queued?

If delayed_work_pending() is safe to call from NMI, you could also call
that to avoid unnecessary irq_work queueing. But do double check if it is.

Also per [1], I gather allow_spin does not always imply NMI. If that is true,
is better to call in_nmi() instead of relying on allow_spin?

[1] 
https://lore.kernel.org/all/caadnvqkk_bgi0bc-td_3pvphyxr3cpc3r8rg-nhwdlediqs...@mail.gmail.com/

Thanks,

--
Joel Fernandes



> +     }
>  
>  unlock_return:
>       krc_this_cpu_unlock(krcp, flags);
> @@ -2017,10 +2107,22 @@ void kvfree_call_rcu_ptr(struct rcu_ptr *head, void 
> *ptr)
>        * CPU can pass the QS state.
>        */
>       if (!success) {
> +             VM_WARN_ON_ONCE(!allow_spin);
>               debug_rcu_head_unqueue((struct rcu_head *) ptr);
>               synchronize_rcu();
>               kvfree(ptr);
>       }
> +     return;
> +
> +defer_free:
> +     VM_WARN_ON_ONCE(allow_spin);
> +     guard(preempt)();
> +
> +     krcp = this_cpu_ptr(&krc);
> +     if (llist_add((struct llist_node *)head, &krcp->llist_head))
> +             irq_work_queue(&krcp->irq_work);
> +     return;
> +
>  }
>  EXPORT_SYMBOL_GPL(kvfree_call_rcu_ptr);
>  
> -- 
> 2.43.0
>

Re: [RFC PATCH 6/7] mm/slab: introduce kfree_rcu_nolock()

Reply via email to