User queue destroy removed the tracked queue VA mappings before removing
the HW queue from MES. If the queue still had active waves, HW like as TCP
could continue accessing queue backing memory after the VM mappings were
removed, resulting in gfxhub page faults.

So, that needs to move queue VA cleanup after HW queue unmap. Meanwhile,
if MES fails to remove the queue, then need to run reset recovery before
freeing queue resources.

Signed-off-by: Prike Liang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 43 +++++++++++++++++------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 83aee0810513..2e3edb6dd506 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -625,8 +625,7 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, 
struct amdgpu_usermode_que
        struct amdgpu_device *adev = uq_mgr->adev;
        struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
        struct amdgpu_vm *vm = &fpriv->vm;
-
-       int r = 0;
+       int r = 0, tmp;
 
        trace_amdgpu_userq_destroy_start(queue);
 
@@ -635,15 +634,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, 
struct amdgpu_usermode_que
        /* Cancel any pending hang detection work and cleanup */
        cancel_delayed_work_sync(&queue->hang_detect_work);
 
-       r = amdgpu_bo_reserve(vm->root.bo, false);
-       if (r) {
-               drm_file_err(uq_mgr->file, "Failed to reserve root bo during 
userqueue destroy\n");
-               trace_amdgpu_userq_destroy_end(queue, r);
-               return r;
-       }
-       amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
-       amdgpu_bo_unreserve(vm->root.bo);
-
        mutex_lock(&uq_mgr->userq_mutex);
        amdgpu_userq_wait_for_last_fence(queue);
 
@@ -651,6 +641,37 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, 
struct amdgpu_usermode_que
        debugfs_remove_recursive(queue->debugfs_queue);
 #endif
        r = amdgpu_userq_unmap_helper(queue);
+
+       if (r) {
+               drm_file_err(uq_mgr->file,
+                            "Failed to unmap userqueue during destroy, 
ret=%d\n",
+                            r);
+               amdgpu_userq_fence_driver_force_completion(queue);
+               amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain,
+                                            &uq_mgr->reset_work);
+               flush_work(&uq_mgr->reset_work);
+       }
+       mutex_unlock(&uq_mgr->userq_mutex);
+       /*
+        * Drop the queue VA mappings only after the HW queue is removed (or
+        * reset recovery has run). Removing the mappings first lets active TCP
+        * waves fault on queue backing memory while MES is still trying to
+        * process REMOVE_QUEUE.
+        */
+       tmp = amdgpu_bo_reserve(vm->root.bo, false);
+       if (tmp) {
+               drm_file_err(uq_mgr->file,
+                            "Failed to reserve root bo during userqueue 
destroy\n");
+               if (!r)
+                       r = tmp;
+       } else {
+               tmp = amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
+               amdgpu_bo_unreserve(vm->root.bo);
+               if (tmp && !r)
+                       r = tmp;
+       }
+
+       mutex_lock(&uq_mgr->userq_mutex);
        atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
        amdgpu_userq_cleanup(queue);
        mutex_unlock(&uq_mgr->userq_mutex);
-- 
2.34.1

Reply via email to