Hi, On 10/24/25 14:14, Maarten Lankhorst wrote:
Hey,Den 2025-10-15 kl. 15:57, skrev Natalie Vock:When the cgroup's memory usage is below the low/min limit and allocation fails, try evicting some unprotected buffers to make space. Otherwise, application buffers may be forced to go into GTT even though usage is below the corresponding low/min limit, if other applications filled VRAM with their allocations first. Signed-off-by: Natalie Vock <[email protected]> --- drivers/gpu/drm/ttm/ttm_bo.c | 43 ++++++++++++++++++++++++++++------ drivers/gpu/drm/ttm/ttm_resource.c | 48 +++++++++++++++++++++++++++----------- include/drm/ttm/ttm_resource.h | 6 ++++- 3 files changed, 76 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 829d99479883594f8be5b9ceed4cc53c4864ace5..7f7872ab2090cc8db188e08ddfdcd12fe924f743 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -490,8 +490,12 @@ int ttm_bo_evict_first(struct ttm_device *bdev, struct ttm_resource_manager *man }struct ttm_bo_alloc_state {+ /** @charge_pool: The memory pool the resource is charged to */ + struct dmem_cgroup_pool_state *charge_pool; /** @limit_pool: Which pool limit we should test against */ struct dmem_cgroup_pool_state *limit_pool; + /** @only_evict_unprotected: If eviction should be restricted to unprotected BOs */ + bool only_evict_unprotected;I'm not entirely sure we should put 'low' and 'min' limits together here.
I think putting 'low' and 'min' together here is accurate. When the allocation is covered by the 'low' limit, but not the 'min' limit, we should evict only allocations that are covered by neither (which is what this flag controls).
However maybe we should allow evicting allocations covered by 'low' when the new allocation is covered by 'min' in ttm_resource_alloc_at_place down below (because 'min' is a stronger guarantee). We could do this simply by setting 'only_evict_unprotected' to false, since memory covered by 'min' can never get evicted anyway.
};/**@@ -546,7 +550,7 @@ static s64 ttm_bo_evict_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object * evict_walk->evicted++; if (evict_walk->res) lret = ttm_resource_alloc(evict_walk->evictor, evict_walk->place, - evict_walk->res, NULL); + evict_walk->res, evict_walk->alloc_state->charge_pool); if (lret == 0) return 1; out: @@ -589,7 +593,7 @@ static int ttm_bo_evict_alloc(struct ttm_device *bdev, lret = ttm_lru_walk_for_evict(&evict_walk.walk, bdev, man, 1);/* One more attempt if we hit low limit? */- if (!lret && evict_walk.hit_low) { + if (!lret && evict_walk.hit_low && !state->only_evict_unprotected) { evict_walk.try_low = true; lret = ttm_lru_walk_for_evict(&evict_walk.walk, bdev, man, 1); } @@ -610,7 +614,8 @@ static int ttm_bo_evict_alloc(struct ttm_device *bdev, } while (!lret && evict_walk.evicted);/* We hit the low limit? Try once more */- if (!lret && evict_walk.hit_low && !evict_walk.try_low) { + if (!lret && evict_walk.hit_low && !evict_walk.try_low && + !state->only_evict_unprotected) { evict_walk.try_low = true; goto retry; } @@ -719,20 +724,40 @@ static int ttm_bo_alloc_at_place(struct ttm_buffer_object *bo, struct ttm_resource **res, struct ttm_bo_alloc_state *alloc_state) { - bool may_evict; + bool may_evict, is_protected = false; int ret;may_evict = (force_space && place->mem_type != TTM_PL_SYSTEM);+ ret = ttm_resource_try_charge(bo, place, &alloc_state->charge_pool, + force_space ? &alloc_state->limit_pool : NULL); + if (ret) { + /* + * -EAGAIN means the charge failed, which we treat like an + * allocation failure. Allocation failures are indicated + * by -ENOSPC, so return that instead. + */ + if (ret == -EAGAIN && !may_evict) + ret = -ENOSPC; + return ret; + }- ret = ttm_resource_alloc(bo, place, res,- force_space ? &alloc_state->limit_pool : NULL); + is_protected = dmem_cgroup_below_min(NULL, alloc_state->charge_pool) || + dmem_cgroup_below_low(NULL, alloc_state->charge_pool); + ret = ttm_resource_alloc(bo, place, res, alloc_state->charge_pool); + alloc_state->only_evict_unprotected = !may_evict && is_protected;This probably deserves a comment to explaing it's ok if we haven't hit low/min yet to evict from those cgroups that did those limits already. It took me a bit of time to understand the idea.
Yeah, that's a bit non-obvious. I'll add a comment. Thanks, Natalie
if (ret) {- if ((ret == -ENOSPC || ret == -EAGAIN) && may_evict) + if ((ret == -ENOSPC || ret == -EAGAIN) && + (may_evict || is_protected)) ret = -EBUSY; return ret; }+ /*+ * Ownership of charge_pool has been transferred to the TTM resource, + * don't make the caller think we still hold a reference to it. + */ + alloc_state->charge_pool = NULL; return 0; }@@ -787,6 +812,7 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo,res, &alloc_state);if (ret == -ENOSPC) {+ dmem_cgroup_pool_state_put(alloc_state.charge_pool); dmem_cgroup_pool_state_put(alloc_state.limit_pool); continue; } else if (ret == -EBUSY) { @@ -796,11 +822,14 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo, dmem_cgroup_pool_state_put(alloc_state.limit_pool);if (ret) {+ dmem_cgroup_pool_state_put( + alloc_state.charge_pool); if (ret != -ENOSPC && ret != -EBUSY) return ret; continue; } } else if (ret) { + dmem_cgroup_pool_state_put(alloc_state.charge_pool); dmem_cgroup_pool_state_put(alloc_state.limit_pool); return ret; } diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index e2c82ad07eb44b5e88bf5b5db1ef54dd6d27823b..fcfa8b51b033745f46a01e40a9dc83e0c69165fc 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -372,30 +372,52 @@ void ttm_resource_fini(struct ttm_resource_manager *man, } EXPORT_SYMBOL(ttm_resource_fini);+/**+ * ttm_resource_try_charge - charge a resource manager's cgroup pool + * @bo: buffer for which an allocation should be charged + * @place: where the allocation is attempted to be placed + * @ret_pool: on charge success, the pool that was charged + * @ret_limit_pool: on charge failure, the pool responsible for the failure + * + * Should be used to charge cgroups before attempting resource allocation. + * When charging succeeds, the value of ret_pool should be passed to + * ttm_resource_alloc. + * + * Returns: 0 on charge success, negative errno on failure. + */ +int ttm_resource_try_charge(struct ttm_buffer_object *bo, + const struct ttm_place *place, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool) +{ + struct ttm_resource_manager *man = + ttm_manager_type(bo->bdev, place->mem_type); + + if (!man->cg) { + *ret_pool = NULL; + if (ret_limit_pool) + *ret_limit_pool = NULL; + return 0; + } + + return dmem_cgroup_try_charge(man->cg, bo->base.size, ret_pool, + ret_limit_pool); +} + int ttm_resource_alloc(struct ttm_buffer_object *bo, const struct ttm_place *place, struct ttm_resource **res_ptr, - struct dmem_cgroup_pool_state **ret_limit_pool) + struct dmem_cgroup_pool_state *charge_pool) { struct ttm_resource_manager *man = ttm_manager_type(bo->bdev, place->mem_type); - struct dmem_cgroup_pool_state *pool = NULL; int ret;- if (man->cg) {- ret = dmem_cgroup_try_charge(man->cg, bo->base.size, &pool, ret_limit_pool); - if (ret) - return ret; - } - ret = man->func->alloc(man, bo, place, res_ptr); - if (ret) { - if (pool) - dmem_cgroup_uncharge(pool, bo->base.size); + if (ret) return ret; - }- (*res_ptr)->css = pool;+ (*res_ptr)->css = charge_pool;spin_lock(&bo->bdev->lru_lock);ttm_resource_add_bulk_move(*res_ptr, bo); diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h index e52bba15012f78e352f392232ac2e89a83afd311..3aef7efdd7cfb8fd93071db85e632b975b53cf81 100644 --- a/include/drm/ttm/ttm_resource.h +++ b/include/drm/ttm/ttm_resource.h @@ -442,10 +442,14 @@ void ttm_resource_init(struct ttm_buffer_object *bo, void ttm_resource_fini(struct ttm_resource_manager *man, struct ttm_resource *res);+int ttm_resource_try_charge(struct ttm_buffer_object *bo,+ const struct ttm_place *place, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool); int ttm_resource_alloc(struct ttm_buffer_object *bo, const struct ttm_place *place, struct ttm_resource **res, - struct dmem_cgroup_pool_state **ret_limit_pool); + struct dmem_cgroup_pool_state *charge_pool); void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res); bool ttm_resource_intersects(struct ttm_device *bdev, struct ttm_resource *res,
