On Wed, Sep 10, 2025 at 1:01 AM Vlastimil Babka <vba...@suse.cz> wrote:
>
> Since we don't control the NUMA locality of objects in percpu sheaves,
> allocations with node restrictions bypass them. Allocations without
> restrictions may however still expect to get local objects with high
> probability, and the introduction of sheaves can decrease it due to
> freed objects from a remote node ending up in percpu sheaves.
>
> The fraction of such remote frees seems low (5% on an 8-node machine)
> but it can be expected that some cache or workload specific corner cases
> exist. We can either conclude that this is not a problem due to the low
> fraction, or we can make remote frees bypass percpu sheaves and go
> directly to their slabs. This will make the remote frees more expensive,
> but if if's only a small fraction, most frees will still benefit from

s/if's/it's

> the lower overhead of percpu sheaves.
>
> This patch thus makes remote object freeing bypass percpu sheaves,
> including bulk freeing, and kfree_rcu() via the rcu_free sheaf. However
> it's not intended to be a 100% guarantee that percpu sheaves will only
> contain local objects. The refill from slabs does not provide that
> guarantee in the first place, and there might be cpu migrations
> happening when we need to unlock the local_lock. Avoiding all that could
> be possible but complicated so we can leave it for later investigation
> whether it would be worth it. It can be expected that the more selective
> freeing will itself prevent accumulation of remote objects in percpu
> sheaves so any such violations would have only short-term effects.
>
> Reviewed-by: Harry Yoo <harry....@oracle.com>
> Signed-off-by: Vlastimil Babka <vba...@suse.cz>

Reviewed-by: Suren Baghdasaryan <sur...@google.com>

> ---
>  mm/slab_common.c |  7 +++++--
>  mm/slub.c        | 42 ++++++++++++++++++++++++++++++++++++------
>  2 files changed, 41 insertions(+), 8 deletions(-)
>
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 005a4319c06a01d2b616a75396fcc43766a62ddb..b6601e0fe598e24bd8d456dce4fc82c65b342bfd 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1623,8 +1623,11 @@ static bool kfree_rcu_sheaf(void *obj)
>
>  	slab = folio_slab(folio);
>  	s = slab->slab_cache;
> -	if (s->cpu_sheaves)
> -		return __kfree_rcu_sheaf(s, obj);
> +	if (s->cpu_sheaves) {
> +		if (likely(!IS_ENABLED(CONFIG_NUMA) ||
> +			   slab_nid(slab) == numa_mem_id()))
> +			return __kfree_rcu_sheaf(s, obj);
> +	}
>
>  	return false;
>  }
> diff --git a/mm/slub.c b/mm/slub.c
> index 35274ce4e709c9da7ac8f9006c824f28709e923d..9699d048b2cd08ee75c4cc3d1e460868704520b1 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -472,6 +472,7 @@ struct slab_sheaf {
>  	};
>  	struct kmem_cache *cache;
>  	unsigned int size;
> +	int node; /* only used for rcu_sheaf */
>  	void *objects[];
>  };
>
> @@ -5828,7 +5829,7 @@ static void rcu_free_sheaf(struct rcu_head *head)
>  	 */
>  	__rcu_free_sheaf_prepare(s, sheaf);
>
> -	barn = get_node(s, numa_mem_id())->barn;
> +	barn = get_node(s, sheaf->node)->barn;
>
>  	/* due to slab_free_hook() */
>  	if (unlikely(sheaf->size == 0))
> @@ -5914,10 +5915,12 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
>
>  	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
>
> -	if (likely(rcu_sheaf->size < s->sheaf_capacity))
> +	if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
>  		rcu_sheaf = NULL;
> -	else
> +	} else {
>  		pcs->rcu_free = NULL;
> +		rcu_sheaf->node = numa_mem_id();
> +	}
>
>  	local_unlock(&s->cpu_sheaves->lock);
>
> @@ -5944,7 +5947,11 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
>  	bool init = slab_want_init_on_free(s);
>  	unsigned int batch, i = 0;
>  	struct node_barn *barn;
> +	void *remote_objects[PCS_BATCH_MAX];
> +	unsigned int remote_nr = 0;
> +	int node = numa_mem_id();
>
> +next_remote_batch:
>  	while (i < size) {
>  		struct slab *slab = virt_to_slab(p[i]);
>
> @@ -5954,7 +5961,15 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
>  		if (unlikely(!slab_free_hook(s, p[i], init, false))) {
>  			p[i] = p[--size];
>  			if (!size)
> -				return;
> +				goto flush_remote;
> +			continue;
> +		}
> +
> +		if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) {
> +			remote_objects[remote_nr] = p[i];
> +			p[i] = p[--size];
> +			if (++remote_nr >= PCS_BATCH_MAX)
> +				goto flush_remote;
>  			continue;
>  		}
>
> @@ -6024,6 +6039,15 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
>  	 */
>  fallback:
>  	__kmem_cache_free_bulk(s, size, p);
> +
> +flush_remote:
> +	if (remote_nr) {
> +		__kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
> +		if (i < size) {
> +			remote_nr = 0;
> +			goto next_remote_batch;
> +		}
> +	}
>  }
>
>  #ifndef CONFIG_SLUB_TINY
> @@ -6115,8 +6139,14 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
>  	if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
>  		return;
>
> -	if (!s->cpu_sheaves || !free_to_pcs(s, object))
> -		do_slab_free(s, slab, object, object, 1, addr);
> +	if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
> +				     slab_nid(slab) == numa_mem_id())) {
> +		if (likely(free_to_pcs(s, object))) {

nit: no need for curly braces here.

> +			return;
> +		}
> +	}
> +
> +	do_slab_free(s, slab, object, object, 1, addr);
>  }
>
>  #ifdef CONFIG_MEMCG
>
> --
> 2.51.0
>
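
For readers following the new batching in free_to_pcs_bulk() above, here is a
minimal user-space sketch of the same pattern: objects that fail the locality
test are set aside into a fixed-size side array via p[i] = p[--size] swaps and
flushed in batches. BATCH_MAX, is_local(), free_local() and free_remote() are
hypothetical stand-ins for PCS_BATCH_MAX, the slab_nid(slab) != node test, the
percpu-sheaf fast path and __kmem_cache_free_bulk(); this is an illustration
under those assumptions, not kernel code.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define BATCH_MAX 4	/* stand-in for PCS_BATCH_MAX; small to show batching */

/* Hypothetical locality test standing in for slab_nid(slab) == node. */
static int is_local(void *obj)
{
	return ((uintptr_t)obj & 1) == 0;	/* pretend even values are local */
}

static void free_local(size_t n, void **p)	/* fast-path stand-in */
{
	(void)p;
	printf("local batch: %zu objects\n", n);
}

static void free_remote(size_t n, void **p)	/* slow-path stand-in */
{
	(void)p;
	printf("remote batch: %zu objects\n", n);
}

static void free_bulk_sketch(size_t size, void **p)
{
	void *remote[BATCH_MAX];
	size_t remote_nr = 0, i = 0;

next_batch:
	while (i < size) {
		if (!is_local(p[i])) {
			/* set the remote object aside, refill slot i from the tail */
			remote[remote_nr] = p[i];
			p[i] = p[--size];
			if (++remote_nr >= BATCH_MAX)
				goto flush;
			continue;
		}
		i++;
	}
	free_local(size, p);	/* everything left in p[] is local */
	/* fall through, mirroring fallback -> flush_remote in the patch */
flush:
	if (remote_nr) {
		free_remote(remote_nr, remote);
		if (i < size) {	/* batch filled mid-scan: resume the loop */
			remote_nr = 0;
			goto next_batch;
		}
	}
}

int main(void)
{
	void *objs[12];

	for (int i = 0; i < 12; i++)
		objs[i] = (void *)(uintptr_t)i;	/* odd values act as remote */
	free_bulk_sketch(12, objs);
	return 0;
}

The p[i] = p[--size] swap removes an object from the work set in O(1) without
preserving order, which is why a single index i can simply resume scanning
after each flush instead of restarting from the beginning.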