On Wed, Sep 10, 2025 at 1:01 AM Vlastimil Babka <vba...@suse.cz> wrote:
>
> Since we don't control the NUMA locality of objects in percpu sheaves,
> allocations with node restrictions bypass them. Allocations without
> restrictions may however still expect to get local objects with high
> probability, and the introduction of sheaves can decrease it due to
> a freed object from a remote node ending up in percpu sheaves.
>
> The fraction of such remote frees seems low (5% on an 8-node machine)
> but it can be expected that some cache or workload specific corner cases
> exist. We can either conclude that this is not a problem due to the low
> fraction, or we can make remote frees bypass percpu sheaves and go
> directly to their slabs. This will make the remote frees more expensive,
> but if if's only a small fraction, most frees will still benefit from

s/if's/it's/

> the lower overhead of percpu sheaves.
>
> This patch thus makes remote object freeing bypass percpu sheaves,
> including bulk freeing, and kfree_rcu() via the rcu_free sheaf. However
> it's not intended to be a 100% guarantee that percpu sheaves will only
> contain local objects. The refill from slabs does not provide that
> guarantee in the first place, and there might be cpu migrations
> happening when we need to unlock the local_lock. Avoiding all that could
> be possible but complicated so we can leave it for later investigation
> whether it would be worth it. It can be expected that the more selective
> freeing will itself prevent accumulation of remote objects in percpu
> sheaves so any such violations would have only short-term effects.
>
> Reviewed-by: Harry Yoo <harry....@oracle.com>
> Signed-off-by: Vlastimil Babka <vba...@suse.cz>

Reviewed-by: Suren Baghdasaryan <sur...@google.com>

> ---
>  mm/slab_common.c |  7 +++++--
>  mm/slub.c        | 42 ++++++++++++++++++++++++++++++++++++------
>  2 files changed, 41 insertions(+), 8 deletions(-)
>
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 
> 005a4319c06a01d2b616a75396fcc43766a62ddb..b6601e0fe598e24bd8d456dce4fc82c65b342bfd
>  100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1623,8 +1623,11 @@ static bool kfree_rcu_sheaf(void *obj)
>
>         slab = folio_slab(folio);
>         s = slab->slab_cache;
> -       if (s->cpu_sheaves)
> -               return __kfree_rcu_sheaf(s, obj);
> +       if (s->cpu_sheaves) {
> +               if (likely(!IS_ENABLED(CONFIG_NUMA) ||
> +                          slab_nid(slab) == numa_mem_id()))
> +                       return __kfree_rcu_sheaf(s, obj);
> +       }
>
>         return false;
>  }
> diff --git a/mm/slub.c b/mm/slub.c
> index 
> 35274ce4e709c9da7ac8f9006c824f28709e923d..9699d048b2cd08ee75c4cc3d1e460868704520b1
>  100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -472,6 +472,7 @@ struct slab_sheaf {
>         };
>         struct kmem_cache *cache;
>         unsigned int size;
> +       int node; /* only used for rcu_sheaf */
>         void *objects[];
>  };
>
> @@ -5828,7 +5829,7 @@ static void rcu_free_sheaf(struct rcu_head *head)
>          */
>         __rcu_free_sheaf_prepare(s, sheaf);
>
> -       barn = get_node(s, numa_mem_id())->barn;
> +       barn = get_node(s, sheaf->node)->barn;
>
>         /* due to slab_free_hook() */
>         if (unlikely(sheaf->size == 0))
> @@ -5914,10 +5915,12 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void 
> *obj)
>
>         rcu_sheaf->objects[rcu_sheaf->size++] = obj;
>
> -       if (likely(rcu_sheaf->size < s->sheaf_capacity))
> +       if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
>                 rcu_sheaf = NULL;
> -       else
> +       } else {
>                 pcs->rcu_free = NULL;
> +               rcu_sheaf->node = numa_mem_id();
> +       }
>
>         local_unlock(&s->cpu_sheaves->lock);
>
> @@ -5944,7 +5947,11 @@ static void free_to_pcs_bulk(struct kmem_cache *s, 
> size_t size, void **p)
>         bool init = slab_want_init_on_free(s);
>         unsigned int batch, i = 0;
>         struct node_barn *barn;
> +       void *remote_objects[PCS_BATCH_MAX];
> +       unsigned int remote_nr = 0;
> +       int node = numa_mem_id();
>
> +next_remote_batch:
>         while (i < size) {
>                 struct slab *slab = virt_to_slab(p[i]);
>
> @@ -5954,7 +5961,15 @@ static void free_to_pcs_bulk(struct kmem_cache *s, 
> size_t size, void **p)
>                 if (unlikely(!slab_free_hook(s, p[i], init, false))) {
>                         p[i] = p[--size];
>                         if (!size)
> -                               return;
> +                               goto flush_remote;
> +                       continue;
> +               }
> +
> +               if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != 
> node)) {
> +                       remote_objects[remote_nr] = p[i];
> +                       p[i] = p[--size];
> +                       if (++remote_nr >= PCS_BATCH_MAX)
> +                               goto flush_remote;
>                         continue;
>                 }
>
> @@ -6024,6 +6039,15 @@ static void free_to_pcs_bulk(struct kmem_cache *s, 
> size_t size, void **p)
>          */
>  fallback:
>         __kmem_cache_free_bulk(s, size, p);
> +
> +flush_remote:
> +       if (remote_nr) {
> +               __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
> +               if (i < size) {
> +                       remote_nr = 0;
> +                       goto next_remote_batch;
> +               }
> +       }
>  }
>
>  #ifndef CONFIG_SLUB_TINY
> @@ -6115,8 +6139,14 @@ void slab_free(struct kmem_cache *s, struct slab 
> *slab, void *object,
>         if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), 
> false)))
>                 return;
>
> -       if (!s->cpu_sheaves || !free_to_pcs(s, object))
> -               do_slab_free(s, slab, object, object, 1, addr);
> +       if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
> +                                    slab_nid(slab) == numa_mem_id())) {
> +               if (likely(free_to_pcs(s, object))) {

nit: no need for curly braces here.

> +                       return;
> +               }
> +       }
> +
> +       do_slab_free(s, slab, object, object, 1, addr);
>  }
>
>  #ifdef CONFIG_MEMCG
>
> --
> 2.51.0
>

Reply via email to