On Fri, 6 Mar 2026 20:55:20 -0800 "JP Kobryn (Meta)" <[email protected]> wrote:
> When investigating pressure on a NUMA node, there is no straightforward way > to determine which policies are driving allocations to it. > > Add per-policy page allocation counters as new node stat items. These > counters track allocations to nodes and also whether the allocations were > intentional or fallbacks. > > The new stats follow the existing numa hit/miss/foreign style and have the > following meanings: > > hit > - for BIND and PREFERRED_MANY, allocation succeeded on node in nodemask > - for other policies, allocation succeeded on intended node > - counted on the node of the allocation > miss > - allocation intended for other node, but happened on this one > - counted on other node > foreign > - allocation intended on this node, but happened on other node > - counted on this node > > Counters are exposed per-memcg, per-node in memory.numa_stat and globally > in /proc/vmstat. > > Signed-off-by: JP Kobryn (Meta) <[email protected]> > --- > v2: > - Replaced single per-policy total counter (PGALLOC_MPOL_*) with > hit/miss/foreign triplet per policy > - Changed from global node stats to per-memcg per-node tracking > > v1: > https://lore.kernel.org/linux-mm/[email protected]/ > > include/linux/mmzone.h | 20 ++++++++++ > mm/memcontrol.c | 60 ++++++++++++++++++++++++++++ > mm/mempolicy.c | 90 ++++++++++++++++++++++++++++++++++++++++-- > mm/vmstat.c | 20 ++++++++++ > 4 files changed, 187 insertions(+), 3 deletions(-) > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 7bd0134c241c..c0517cbcb0e2 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -323,6 +323,26 @@ enum node_stat_item { > PGSCAN_ANON, > PGSCAN_FILE, > PGREFILL, > +#ifdef CONFIG_NUMA > + NUMA_MPOL_LOCAL_HIT, > + NUMA_MPOL_LOCAL_MISS, > + NUMA_MPOL_LOCAL_FOREIGN, > + NUMA_MPOL_PREFERRED_HIT, > + NUMA_MPOL_PREFERRED_MISS, > + NUMA_MPOL_PREFERRED_FOREIGN, > + NUMA_MPOL_PREFERRED_MANY_HIT, > + NUMA_MPOL_PREFERRED_MANY_MISS, > + NUMA_MPOL_PREFERRED_MANY_FOREIGN, 
> + NUMA_MPOL_BIND_HIT, > + NUMA_MPOL_BIND_MISS, > + NUMA_MPOL_BIND_FOREIGN, > + NUMA_MPOL_INTERLEAVE_HIT, > + NUMA_MPOL_INTERLEAVE_MISS, > + NUMA_MPOL_INTERLEAVE_FOREIGN, > + NUMA_MPOL_WEIGHTED_INTERLEAVE_HIT, > + NUMA_MPOL_WEIGHTED_INTERLEAVE_MISS, > + NUMA_MPOL_WEIGHTED_INTERLEAVE_FOREIGN, > +#endif > #ifdef CONFIG_HUGETLB_PAGE > NR_HUGETLB, > #endif > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 982231a078f2..4d29f723a2de 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -420,6 +420,26 @@ static const unsigned int memcg_node_stat_items[] = { > PGSCAN_ANON, > PGSCAN_FILE, > PGREFILL, > +#ifdef CONFIG_NUMA > + NUMA_MPOL_LOCAL_HIT, > + NUMA_MPOL_LOCAL_MISS, > + NUMA_MPOL_LOCAL_FOREIGN, > + NUMA_MPOL_PREFERRED_HIT, > + NUMA_MPOL_PREFERRED_MISS, > + NUMA_MPOL_PREFERRED_FOREIGN, > + NUMA_MPOL_PREFERRED_MANY_HIT, > + NUMA_MPOL_PREFERRED_MANY_MISS, > + NUMA_MPOL_PREFERRED_MANY_FOREIGN, > + NUMA_MPOL_BIND_HIT, > + NUMA_MPOL_BIND_MISS, > + NUMA_MPOL_BIND_FOREIGN, > + NUMA_MPOL_INTERLEAVE_HIT, > + NUMA_MPOL_INTERLEAVE_MISS, > + NUMA_MPOL_INTERLEAVE_FOREIGN, > + NUMA_MPOL_WEIGHTED_INTERLEAVE_HIT, > + NUMA_MPOL_WEIGHTED_INTERLEAVE_MISS, > + NUMA_MPOL_WEIGHTED_INTERLEAVE_FOREIGN, > +#endif > #ifdef CONFIG_HUGETLB_PAGE > NR_HUGETLB, > #endif > @@ -1591,6 +1611,26 @@ static const struct memory_stat memory_stats[] = { > #ifdef CONFIG_NUMA_BALANCING > { "pgpromote_success", PGPROMOTE_SUCCESS }, > #endif > +#ifdef CONFIG_NUMA > + { "numa_mpol_local_hit", NUMA_MPOL_LOCAL_HIT > }, > + { "numa_mpol_local_miss", NUMA_MPOL_LOCAL_MISS > }, > + { "numa_mpol_local_foreign", NUMA_MPOL_LOCAL_FOREIGN > }, > + { "numa_mpol_preferred_hit", NUMA_MPOL_PREFERRED_HIT > }, > + { "numa_mpol_preferred_miss", NUMA_MPOL_PREFERRED_MISS > }, > + { "numa_mpol_preferred_foreign", NUMA_MPOL_PREFERRED_FOREIGN > }, > + { "numa_mpol_preferred_many_hit", NUMA_MPOL_PREFERRED_MANY_HIT > }, > + { "numa_mpol_preferred_many_miss", NUMA_MPOL_PREFERRED_MANY_MISS > }, > + { 
"numa_mpol_preferred_many_foreign", > NUMA_MPOL_PREFERRED_MANY_FOREIGN }, > + { "numa_mpol_bind_hit", NUMA_MPOL_BIND_HIT > }, > + { "numa_mpol_bind_miss", NUMA_MPOL_BIND_MISS > }, > + { "numa_mpol_bind_foreign", NUMA_MPOL_BIND_FOREIGN > }, > + { "numa_mpol_interleave_hit", NUMA_MPOL_INTERLEAVE_HIT > }, > + { "numa_mpol_interleave_miss", NUMA_MPOL_INTERLEAVE_MISS > }, > + { "numa_mpol_interleave_foreign", NUMA_MPOL_INTERLEAVE_FOREIGN > }, > + { "numa_mpol_weighted_interleave_hit", > NUMA_MPOL_WEIGHTED_INTERLEAVE_HIT }, > + { "numa_mpol_weighted_interleave_miss", > NUMA_MPOL_WEIGHTED_INTERLEAVE_MISS }, > + { "numa_mpol_weighted_interleave_foreign", > NUMA_MPOL_WEIGHTED_INTERLEAVE_FOREIGN }, > +#endif > }; > > /* The actual unit of the state item, not the same as the output unit */ > @@ -1642,6 +1682,26 @@ static int memcg_page_state_output_unit(int item) > case PGREFILL: > #ifdef CONFIG_NUMA_BALANCING > case PGPROMOTE_SUCCESS: > +#endif > +#ifdef CONFIG_NUMA > + case NUMA_MPOL_LOCAL_HIT: > + case NUMA_MPOL_LOCAL_MISS: > + case NUMA_MPOL_LOCAL_FOREIGN: > + case NUMA_MPOL_PREFERRED_HIT: > + case NUMA_MPOL_PREFERRED_MISS: > + case NUMA_MPOL_PREFERRED_FOREIGN: > + case NUMA_MPOL_PREFERRED_MANY_HIT: > + case NUMA_MPOL_PREFERRED_MANY_MISS: > + case NUMA_MPOL_PREFERRED_MANY_FOREIGN: > + case NUMA_MPOL_BIND_HIT: > + case NUMA_MPOL_BIND_MISS: > + case NUMA_MPOL_BIND_FOREIGN: > + case NUMA_MPOL_INTERLEAVE_HIT: > + case NUMA_MPOL_INTERLEAVE_MISS: > + case NUMA_MPOL_INTERLEAVE_FOREIGN: > + case NUMA_MPOL_WEIGHTED_INTERLEAVE_HIT: > + case NUMA_MPOL_WEIGHTED_INTERLEAVE_MISS: > + case NUMA_MPOL_WEIGHTED_INTERLEAVE_FOREIGN: > #endif > return 1; > default: > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > index 0e5175f1c767..2417de75098d 100644 > --- a/mm/mempolicy.c > +++ b/mm/mempolicy.c > @@ -117,6 +117,7 @@ > #include <asm/tlb.h> > #include <linux/uaccess.h> > #include <linux/memory.h> > +#include <linux/memcontrol.h> > > #include "internal.h" > > @@ -2426,6 +2427,83 @@ static 
struct page *alloc_pages_preferred_many(gfp_t > gfp, unsigned int order, > return page; > } > > +/* > + * Count a mempolicy allocation. Stats are tracked per-node and per-cgroup. > + * The following numa_{hit/miss/foreign} pattern is used: > + * > + * hit > + * - for BIND and PREFERRED_MANY, allocation succeeded on node in > nodemask > + * - for other policies, allocation succeeded on intended node > + * - counted on the node of the allocation > + * miss > + * - allocation intended for other node, but happened on this one > + * - counted on other node > + * foreign > + * - allocation intended on this node, but happened on other node > + * - counted on this node > + */ > +static void mpol_count_numa_alloc(struct mempolicy *pol, int intended_nid, > + struct page *page, unsigned int order) > +{ > + int actual_nid = page_to_nid(page); > + long nr_pages = 1L << order; > + enum node_stat_item hit_idx; > + struct mem_cgroup *memcg; > + struct lruvec *lruvec; > + bool is_hit; > + > + if (!root_mem_cgroup || mem_cgroup_disabled()) > + return; Hello JP! The stats are exposed via /proc/vmstat and are guarded by CONFIG_NUMA, not CONFIG_MEMCG. Returning early here would make them inaccurate. Would it make sense to fall back to mod_node_page_state() when memcg is not available, so that these global counters work regardless of cgroup configuration? > + > + /* > + * Start with hit then use +1 or +2 later on to change to miss or > + * foreign respectively if needed.
> + */ > + switch (pol->mode) { > + case MPOL_PREFERRED: > + hit_idx = NUMA_MPOL_PREFERRED_HIT; > + break; > + case MPOL_PREFERRED_MANY: > + hit_idx = NUMA_MPOL_PREFERRED_MANY_HIT; > + break; > + case MPOL_BIND: > + hit_idx = NUMA_MPOL_BIND_HIT; > + break; > + case MPOL_INTERLEAVE: > + hit_idx = NUMA_MPOL_INTERLEAVE_HIT; > + break; > + case MPOL_WEIGHTED_INTERLEAVE: > + hit_idx = NUMA_MPOL_WEIGHTED_INTERLEAVE_HIT; > + break; > + default: > + hit_idx = NUMA_MPOL_LOCAL_HIT; > + break; > + } > + > + if (pol->mode == MPOL_BIND || pol->mode == MPOL_PREFERRED_MANY) > + is_hit = node_isset(actual_nid, pol->nodes); > + else > + is_hit = (actual_nid == intended_nid); > + > + rcu_read_lock(); > + memcg = mem_cgroup_from_task(current); > + > + if (is_hit) { > + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(actual_nid)); > + mod_lruvec_state(lruvec, hit_idx, nr_pages); > + } else { > + /* account for miss on the fallback node */ > + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(actual_nid)); > + mod_lruvec_state(lruvec, hit_idx + 1, nr_pages); > + > + /* account for foreign on the intended node */ > + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(intended_nid)); > + mod_lruvec_state(lruvec, hit_idx + 2, nr_pages); > + } > + > + rcu_read_unlock(); > +} > + > /** > * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. > * @gfp: GFP flags.

