Register the VRAM manager with the dmem cgroup reclaim infrastructure
so that lowering dmem.max below current VRAM usage triggers TTM
eviction rather than failing with -EBUSY.

Guard the place->flags dereference in amdgpu_ttm_bo_eviction_valuable()
against a NULL place, as the TTM reclaim path passes a NULL place in
cgroup drain mode.

v3:
- Rebased on fix for uninitialized list and buddy allocator on the
  drmm_cgroup_register_region() error path.

v5:
- Rebased on the introduction of struct dmem_cgroup_init.
- Clear the reclaim callback in amdgpu_vram_mgr_fini() to prevent
  use-after-free if cgroup reclaim is triggered after driver unbind
  while userspace holds an open DRM file descriptor. (Sashiko-bot)
- Switch from drmm_cgroup_register_region() to the raw
  dmem_cgroup_register_region() and store the region in
  amdgpu_vram_mgr.cg_region. Call dmem_cgroup_unregister_region()
  in amdgpu_vram_mgr_fini() after ttm_resource_manager_evict_all()
  to drain in-flight reclaim callbacks, and clear man->cg afterwards.
  This is required because amdgpu's vram manager fini is called
  explicitly during driver unbind, which may precede the DRM device
  release and thus precede any drmm-based cleanup. (Sashiko-bot)

v6:
- Fix mgr->cg_region never being assigned, which made
  dmem_cgroup_unregister_region() in fini silently no-op on NULL
  and leak the region. (Sashiko-bot)
- Reorder fini to call set_used(false) and evict_all() before
  dmem_cgroup_unregister_region(), so ttm_resource_free() can
  uncharge via man->cg during eviction; clear man->cg after
  unregister. (Sashiko-bot)

Assisted-by: GitHub_Copilot:claude-sonnet-4.6
Signed-off-by: Thomas Hellström <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c      |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 31 ++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h |  2 ++
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 2740de94e93c..8cbcd33f51a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1488,7 +1488,7 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
        dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
                                DMA_RESV_USAGE_BOOKKEEP, f) {
                if (amdkfd_fence_check_mm(f, current->mm) &&
-                   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
+                   !(place && (place->flags & TTM_PL_FLAG_CONTIGUOUS)))
                        return false;
        }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 08f05c3aed1d..2250bab0970d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -906,6 +906,10 @@ static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
        .debug  = amdgpu_vram_mgr_debug
 };
 
+static const struct dmem_cgroup_ops amdgpu_vram_mgr_dmem_ops = {
+       .reclaim = ttm_resource_manager_dmem_reclaim,
+};
+
 /**
  * amdgpu_vram_mgr_init - init VRAM manager and DRM MM
  *
@@ -917,6 +921,7 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
 {
        struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
        struct ttm_resource_manager *man = &mgr->manager;
+       struct dmem_cgroup_region *cg;
        int err;
 
        ttm_resource_manager_init(man, &adev->mman.bdev,
@@ -933,12 +938,16 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
        if (err)
                return err;
 
-       man->cg = drmm_cgroup_register_region(adev_to_drm(adev), "vram",
-                                             &(struct dmem_cgroup_init){
-                                               .size = 
adev->gmc.real_vram_size,
-                                             });
-       if (IS_ERR(man->cg))
-               return PTR_ERR(man->cg);
+       cg = dmem_cgroup_register_region(&(struct dmem_cgroup_init){
+                                            .size = adev->gmc.real_vram_size,
+                                            .ops = &amdgpu_vram_mgr_dmem_ops,
+                                            .reclaim_priv = man,
+                                        }, "vram");
+       if (IS_ERR(cg))
+               return PTR_ERR(cg);
+
+       mgr->cg_region = cg;
+       ttm_resource_manager_set_dmem_region(man, cg);
 
        ttm_set_driver_manager(&adev->mman.bdev, TTM_PL_VRAM, &mgr->manager);
        ttm_resource_manager_set_used(man, true);
@@ -966,6 +975,16 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
        if (ret)
                return;
 
+       /*
+        * Drain any in-flight dmem cgroup reclaim callbacks and remove the
+        * region from the global list.  This must happen after evict_all()
+        * so that ttm_resource_free() can still uncharge via man->cg while
+        * BOs are being evicted.
+        */
+       dmem_cgroup_unregister_region(mgr->cg_region);
+       mgr->cg_region = NULL;
+       man->cg = NULL;
+
        mutex_lock(&mgr->lock);
        list_for_each_entry_safe(rsv, temp, &mgr->reservations_pending, blocks)
                kfree(rsv);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h
index 429a21a2e9b2..07103cddb335 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h
@@ -36,6 +36,8 @@ struct amdgpu_vram_mgr {
        atomic64_t vis_usage;
        u64 default_page_size;
        struct list_head allocated_vres_list;
+       /** @cg_region: dmem cgroup region for VRAM; unregistered in fini. */
+       struct dmem_cgroup_region *cg_region;
 };
 
 struct amdgpu_vres_task {
-- 
2.54.0

Reply via email to