On Thu, Jan 23, 2025 at 11:37:20AM +0100, Vlastimil Babka wrote:
> RCU has been special-casing callback function pointers that are integers
> lower than 4096 as offsets of rcu_head for kvfree() instead. The tree
> RCU implementation no longer does that as the batched kvfree_rcu() is
> not a simple call_rcu(). The tiny RCU still does, and the plan is also
> to make tree RCU use call_rcu() for SLUB_TINY configurations.
> 
> Instead of teaching tree RCU again to special case the offsets, let's
> remove the special casing completely. Since there's no SLOB anymore, it
> is possible to create a callback function that can take a pointer to a
> middle of slab object with unknown offset and determine the object's
> pointer before freeing it, so implement that as kvfree_rcu_cb().
> 
> Large kmalloc and vmalloc allocations are handled simply by aligning
> down to page size. For that we retain the requirement that the offset is
> smaller than 4096. But we can remove __is_kvfree_rcu_offset() completely
> and instead just opencode the condition in the BUILD_BUG_ON() check.
> 
> Signed-off-by: Vlastimil Babka <vba...@suse.cz>
> ---
>  include/linux/rcupdate.h | 24 +++++++++---------------
>  kernel/rcu/tiny.c        | 13 -------------
>  mm/slab.h                |  2 ++
>  mm/slab_common.c         |  5 +----
>  mm/slub.c                | 42 ++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 54 insertions(+), 32 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 
> 3f70d1c8144426f40553c8c589f07097ece8a706..7ff16a70ca1c0fb1012c4118388f60687c5e5b3f
>  100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -1025,12 +1025,6 @@ static inline notrace void 
> rcu_read_unlock_sched_notrace(void)
>  #define RCU_POINTER_INITIALIZER(p, v) \
>               .p = RCU_INITIALIZER(v)
>  
> -/*
> - * Does the specified offset indicate that the corresponding rcu_head
> - * structure can be handled by kvfree_rcu()?
> - */
> -#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
> -
>  /**
>   * kfree_rcu() - kfree an object after a grace period.
>   * @ptr: pointer to kfree for double-argument invocations.
> @@ -1041,11 +1035,11 @@ static inline notrace void 
> rcu_read_unlock_sched_notrace(void)
>   * when they are used in a kernel module, that module must invoke the
>   * high-latency rcu_barrier() function at module-unload time.
>   *
> - * The kfree_rcu() function handles this issue.  Rather than encoding a
> - * function address in the embedded rcu_head structure, kfree_rcu() instead
> - * encodes the offset of the rcu_head structure within the base structure.
> - * Because the functions are not allowed in the low-order 4096 bytes of
> - * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
> + * The kfree_rcu() function handles this issue. In order to have a universal
> + * callback function handling different offsets of rcu_head, the callback 
> needs
> + * to determine the starting address of the freed object, which can be a 
> large
> + * kmalloc of vmalloc allocation. To allow simply aligning the pointer down 
> to
> + * page boundary for those, only offsets up to 4095 bytes can be 
> accommodated.
>   * If the offset is larger than 4095 bytes, a compile-time error will
>   * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
>   * either fall back to use of call_rcu() or rearrange the structure to
> @@ -1091,10 +1085,10 @@ void kvfree_call_rcu(struct rcu_head *head, void 
> *ptr);
>  do {                                                                 \
>       typeof (ptr) ___p = (ptr);                                      \
>                                                                       \
> -     if (___p) {                                                             
>         \
> -             BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), 
> rhf)));   \
> -             kvfree_call_rcu(&((___p)->rhf), (void *) (___p));               
>         \
> -     }                                                                       
>         \
> +     if (___p) {                                                     \
> +             BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096);    \
> +             kvfree_call_rcu(&((___p)->rhf), (void *) (___p));       \
> +     }                                                               \
>
Why removing the macro? At least __is_kvfree_rcu_offset() makes it
clear what and why + it has a nice comment what it is used for. 4096
looks like hard-coded value.

Or you do not want that someone else started to use that macro in
another places?

>  } while (0)
>  
>  #define kvfree_rcu_arg_1(ptr)                                        \
> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
> index 
> 0ec27093d0e14a4b1060ea08932c4ac13f9b0f26..77e0db0221364376a99ebeb17485650879385a6e
>  100644
> --- a/kernel/rcu/tiny.c
> +++ b/kernel/rcu/tiny.c
> @@ -88,12 +88,6 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
>       unsigned long offset = (unsigned long)head->func;
>  
>       rcu_lock_acquire(&rcu_callback_map);
> -     if (__is_kvfree_rcu_offset(offset)) {
> -             trace_rcu_invoke_kvfree_callback("", head, offset);
> -             kvfree((void *)head - offset);
> -             rcu_lock_release(&rcu_callback_map);
> -             return true;
> -     }
>  
>       trace_rcu_invoke_callback("", head);
>       f = head->func;
> @@ -159,10 +153,6 @@ void synchronize_rcu(void)
>  }
>  EXPORT_SYMBOL_GPL(synchronize_rcu);
>  
> -static void tiny_rcu_leak_callback(struct rcu_head *rhp)
> -{
> -}
> -
>  /*
>   * Post an RCU callback to be invoked after the end of an RCU grace
>   * period.  But since we have but one CPU, that would be after any
> @@ -178,9 +168,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>                       pr_err("%s(): Double-freed CB %p->%pS()!!!  ", 
> __func__, head, head->func);
>                       mem_dump_obj(head);
>               }
> -
> -             if (!__is_kvfree_rcu_offset((unsigned long)head->func))
> -                     WRITE_ONCE(head->func, tiny_rcu_leak_callback);
>               return;
>       }
>  
> diff --git a/mm/slab.h b/mm/slab.h
> index 
> e9fd9bf0bfa65b343a4ae0ecd5b4c2a325b04883..2f01c7317988ce036f0b22807403226a59f0f708
>  100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -604,6 +604,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct 
> slab *slab,
>                           void **p, int objects, struct slabobj_ext 
> *obj_exts);
>  #endif
>  
> +void kvfree_rcu_cb(struct rcu_head *head);
> +
>  size_t __ksize(const void *objp);
>  
>  static inline size_t slab_ksize(const struct kmem_cache *s)
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 
> 330cdd8ebc5380090ee784c58e8ca1d1a52b3758..f13d2c901daf1419993620459fbd5845eecb85f1
>  100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1532,9 +1532,6 @@ kvfree_rcu_list(struct rcu_head *head)
>               rcu_lock_acquire(&rcu_callback_map);
>               trace_rcu_invoke_kvfree_callback("slab", head, offset);
>  
> -             if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
> -                     kvfree(ptr);
> -
This is not correct unless i miss something. Why do you remove this?

>  
> diff --git a/mm/slub.c b/mm/slub.c
> index 
> c2151c9fee228d121a9cbcc220c3ae054769dacf..651381bf05566e88de8493e0550f121d23b757a1
>  100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -19,6 +19,7 @@
>  #include <linux/bitops.h>
>  #include <linux/slab.h>
>  #include "slab.h"
> +#include <linux/vmalloc.h>
>  #include <linux/proc_fs.h>
>  #include <linux/seq_file.h>
>  #include <linux/kasan.h>
> @@ -4732,6 +4733,47 @@ static void free_large_kmalloc(struct folio *folio, 
> void *object)
>       folio_put(folio);
>  }
>  
> +void kvfree_rcu_cb(struct rcu_head *head)
> +{
> +     void *obj = head;
> +     struct folio *folio;
> +     struct slab *slab;
> +     struct kmem_cache *s;
> +     void *slab_addr;
> +
> +     if (unlikely(is_vmalloc_addr(obj))) {
> +             obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
> +             vfree(obj);
> +             return;
> +     }
> +
> +     folio = virt_to_folio(obj);
> +     if (unlikely(!folio_test_slab(folio))) {
> +             /*
> +              * rcu_head offset can be only less than page size so no need to
> +              * consider folio order
> +              */
> +             obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
> +             free_large_kmalloc(folio, obj);
> +             return;
> +     }
> +
> +     slab = folio_slab(folio);
> +     s = slab->slab_cache;
> +     slab_addr = folio_address(folio);
> +
> +     if (is_kfence_address(obj)) {
> +             obj = kfence_object_start(obj);
> +     } else {
> +             unsigned int idx = __obj_to_index(s, slab_addr, obj);
> +
> +             obj = slab_addr + s->size * idx;
> +             obj = fixup_red_left(s, obj);
> +     }
> +
> +     slab_free(s, slab, obj, _RET_IP_);
> +}
> +
Tiny computer case. Just small nit, maybe remove unlikely() but you decide :)

--
Uladzislau Rezki

Reply via email to