Add a root-only cgroupfs file "dmem.memcg" that lets an administrator configure whether allocations in a dmem region should also be charged to the memory controller.
To handle inheritance, dmem declares a depends_on dependency on the memory controller, unless MEMCG isn't configured in. Double-charging is disabled by default. Once a charge is attempted, the setting is locked to prevent inconsistent accounting; the setting is tracked by a small 4-state machine (off, on, locked off, locked on). The memcg to charge is derived from the pool's cgroup, since the pool holds a reference to the dmem cgroup state that keeps the cgroup alive until it gets uncharged. Signed-off-by: Eric Chanudet <[email protected]> --- Documentation/admin-guide/cgroup-v2.rst | 23 +++++ kernel/cgroup/dmem.c | 158 +++++++++++++++++++++++++++++++- 2 files changed, 178 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 6efd0095ed995b1550317662bc1b56c7a7f3db23..1d2fa55ddf0faa17baa916a8914d3033e8e42359 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2828,6 +2828,29 @@ DMEM Interface Files drm/0000:03:00.0/vram0 12550144 drm/0000:03:00.0/stolen 8650752 + dmem.memcg + A readwrite nested-keyed file that exists only on the root + cgroup. It configures whether allocations in a dmem region + should also be charged to the memory controller. + + Upon the first charge to a region, its setting can no longer be changed + and is reported as "[true|false] (locked)". + + Charges to the memory controller are visible in ``memory.stat`` as the + ``dmem`` entry, reported in bytes. + + An example read output follows:: + + drm/0000:03:00.0/vram0 false + drm/0000:03:00.0/stolen false (locked) + + Writing uses the same nested-keyed format:: + + echo "drm/0000:03:00.0/vram0 true" > dmem.memcg + + This file is only available when the kernel is built with + ``CONFIG_MEMCG``.
+ HugeTLB ------- diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 1ab1fb47f2711ecc60dd13e611a8a4920b48f3e9..e07b20b8025c528f190f84c76b088cb8a32a7f5e 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -17,6 +17,14 @@ #include <linux/refcount.h> #include <linux/rculist.h> #include <linux/slab.h> +#include <linux/memcontrol.h> + +enum dmem_memcg_status { + DMEM_MEMCG_OFF, + DMEM_MEMCG_ON, + DMEM_MEMCG_LOCKED_OFF, + DMEM_MEMCG_LOCKED_ON, +}; struct dmem_cgroup_region { /** @@ -51,6 +59,14 @@ struct dmem_cgroup_region { * No new pools should be added to the region afterwards. */ bool unregistered; + + /** + * @memcg_status: Whether allocation in this region should charge memcg. + * DMEM_MEMCG_OFF/DMEM_MEMCG_ON or + * DMEM_MEMCG_LOCKED_OFF/DMEM_MEMCG_LOCKED_ON, frozen after first allocation. + * Transitions to a locked state are one-way. + */ + atomic_t memcg_status; }; struct dmemcg_state { @@ -609,6 +625,34 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) return pool; } +static bool apply_memcg_charge(atomic_t *status) +{ + int state = atomic_read(status); + + for (;;) { + switch (state) { + case DMEM_MEMCG_OFF: + state = atomic_cmpxchg(status, DMEM_MEMCG_OFF, + DMEM_MEMCG_LOCKED_OFF); + if (state != DMEM_MEMCG_OFF) + continue; + return false; + case DMEM_MEMCG_LOCKED_OFF: + return false; + case DMEM_MEMCG_ON: + state = atomic_cmpxchg(status, DMEM_MEMCG_ON, + DMEM_MEMCG_LOCKED_ON); + if (state != DMEM_MEMCG_ON) + continue; + return true; + case DMEM_MEMCG_LOCKED_ON: + return true; + } + WARN_ONCE(1, "Invalid memcg_status (%#x).\n", state); + return false; + } +} + /** * dmem_cgroup_uncharge() - Uncharge a pool. * @pool: Pool to uncharge. 
@@ -624,6 +668,12 @@ void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) return; page_counter_uncharge(&pool->cnt, size); + + if (atomic_read(&pool->region->memcg_status) == DMEM_MEMCG_LOCKED_ON && + !WARN_ON_ONCE(size > (u64)UINT_MAX << PAGE_SHIFT)) + mem_cgroup_dmem_uncharge(pool->cs->css.cgroup, + PAGE_ALIGN(size) >> PAGE_SHIFT); + css_put(&pool->cs->css); dmemcg_pool_put(pool); } @@ -655,6 +705,8 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, struct dmemcg_state *cg; struct dmem_cgroup_pool_state *pool; struct page_counter *fail; + unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; + bool charge_memcg; int ret; *ret_pool = NULL; @@ -670,7 +722,28 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, pool = get_cg_pool_unlocked(cg, region); if (IS_ERR(pool)) { ret = PTR_ERR(pool); - goto err; + goto err_css_put; + } + + charge_memcg = apply_memcg_charge(&region->memcg_status); + if (charge_memcg) { + /* mem_cgroup_dmem_charge limitation from try_charge_memcg */ + if (size > (u64)UINT_MAX << PAGE_SHIFT) { + ret = -EINVAL; + dmemcg_pool_put(pool); + goto err_css_put; + } + + if (!mem_cgroup_dmem_charge(pool->cs->css.cgroup, nr_pages, + GFP_KERNEL)) { + /* + * No dmem_cgroup_state_evict_valuable() could help, + * there's no ret_limit_pool to return.
+ */ + ret = -ENOMEM; + dmemcg_pool_put(pool); + goto err_css_put; + } } if (!page_counter_try_charge(&pool->cnt, size, &fail)) { @@ -681,14 +754,17 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, } dmemcg_pool_put(pool); ret = -EAGAIN; - goto err; + goto err_uncharge_memcg; } /* On success, reference from get_current_dmemcs is transferred to *ret_pool */ *ret_pool = pool; return 0; -err: +err_uncharge_memcg: + if (charge_memcg) + mem_cgroup_dmem_uncharge(pool->cs->css.cgroup, nr_pages); +err_css_put: css_put(&cg->css); return ret; } @@ -845,6 +921,71 @@ static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of, return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max); } +#ifdef CONFIG_MEMCG +static int dmem_cgroup_memcg_show(struct seq_file *sf, void *v) +{ + struct dmem_cgroup_region *region; + + rcu_read_lock(); + list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) { + int state = atomic_read(&region->memcg_status); + + seq_printf(sf, "%s %s\n", region->name, + state == DMEM_MEMCG_ON ? "true" : + state == DMEM_MEMCG_OFF ? "false" : + state == DMEM_MEMCG_LOCKED_ON ? "true (locked)" : + state == DMEM_MEMCG_LOCKED_OFF ? "false (locked)" : + "(invalid)"); + } + rcu_read_unlock(); + return 0; +} + +static ssize_t dmem_cgroup_memcg_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + while (buf) { + struct dmem_cgroup_region *region; + char *options, *name; + bool flag; + + options = buf; + buf = strchr(buf, '\n'); + if (buf) + *buf++ = '\0'; + + options = strstrip(options); + if (!options[0]) + continue; + + name = strsep(&options, " \t"); + if (!name[0]) + continue; + + if (!options || !options[0]) + return -EINVAL; + + if (kstrtobool(options, &flag)) + return -EINVAL; + + rcu_read_lock(); + region = dmemcg_get_region_by_name(name); + rcu_read_unlock(); + if (!region) + return -ENODEV; + + atomic_cmpxchg(&region->memcg_status, + flag ? DMEM_MEMCG_OFF : DMEM_MEMCG_ON, + flag ?
DMEM_MEMCG_ON : DMEM_MEMCG_OFF); + /* Continue if a region is already locked. */ + + kref_put(&region->ref, dmemcg_free_region); + } + + return nbytes; +} +#endif + static struct cftype files[] = { { .name = "capacity", @@ -873,6 +1014,14 @@ static struct cftype files[] = { .seq_show = dmem_cgroup_region_max_show, .flags = CFTYPE_NOT_ON_ROOT, }, +#ifdef CONFIG_MEMCG + { + .name = "memcg", + .write = dmem_cgroup_memcg_write, + .seq_show = dmem_cgroup_memcg_show, + .flags = CFTYPE_ONLY_ON_ROOT, + }, +#endif { } /* Zero entry terminates. */ }; @@ -882,4 +1031,7 @@ struct cgroup_subsys dmem_cgrp_subsys = { .css_offline = dmemcs_offline, .legacy_cftypes = files, .dfl_cftypes = files, +#ifdef CONFIG_MEMCG + .depends_on = 1 << memory_cgrp_id, +#endif }; -- 2.52.0
