That was actually complete nonsense and not validating the BOs at all. The code just cleared all VM areas were it couldn't grab the lock for a BO.
Try to fix this. Only compile tested at the moment. v2: fix fence slot reservation as well as pointed out by Sunil. also validate PDs, PTs, per VM BOs and update PDEs v3: grab the status_lock while working with the done list. Signed-off-by: Christian König <christian.koe...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 136 ++++++++++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 35 ++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 + 3 files changed, 97 insertions(+), 76 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 424831997cb1..abc2f96bea76 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -545,108 +545,92 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) return ret; } -static int -amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo) +static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo) { struct ttm_operation_ctx ctx = { false, false }; - int ret; amdgpu_bo_placement_from_domain(bo, bo->allowed_domains); - - ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); - if (ret) - DRM_ERROR("Fail to validate\n"); - - return ret; + return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); } static int -amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr) +amdgpu_userq_validate_bos(struct amdgpu_device *adev, struct drm_exec *exec, + struct amdgpu_vm *vm) { - struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); - struct amdgpu_vm *vm = &fpriv->vm; - struct amdgpu_device *adev = uq_mgr->adev; + struct ttm_operation_ctx ctx = { false, false }; struct amdgpu_bo_va *bo_va; - struct ww_acquire_ctx *ticket; - struct drm_exec exec; struct amdgpu_bo *bo; - struct dma_resv *resv; - bool clear, unlock; - int ret = 0; - - drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0); - drm_exec_until_all_locked(&exec) { - ret = amdgpu_vm_lock_pd(vm, &exec, 2); - drm_exec_retry_on_contention(&exec); - if (unlikely(ret)) { - drm_file_err(uq_mgr->file, "Failed to lock PD\n"); - goto unlock_all; - } - - /* Lock the done list */ - list_for_each_entry(bo_va, &vm->done, base.vm_status) { - bo = bo_va->base.bo; - if (!bo) - continue; - - ret = drm_exec_lock_obj(&exec, &bo->tbo.base); - drm_exec_retry_on_contention(&exec); - if (unlikely(ret)) - goto unlock_all; - } - } + int ret; spin_lock(&vm->status_lock); - while (!list_empty(&vm->moved)) { - bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va, + while (!list_empty(&vm->invalidated)) { + bo_va = list_first_entry(&vm->invalidated, + struct amdgpu_bo_va, base.vm_status); spin_unlock(&vm->status_lock); - /* Per VM BOs never need to bo cleared in the page tables */ + bo = bo_va->base.bo; + ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2); + if (unlikely(ret)) + return ret; + + amdgpu_bo_placement_from_domain(bo, bo->allowed_domains); + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); + if (ret) + return ret; + + /* This moves the bo_va to the done list */ ret = amdgpu_vm_bo_update(adev, bo_va, false); if (ret) - goto unlock_all; + return ret; + spin_lock(&vm->status_lock); } + spin_unlock(&vm->status_lock); - ticket = &exec.ticket; - while (!list_empty(&vm->invalidated)) { - bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, - base.vm_status); - resv = bo_va->base.bo->tbo.base.resv; - spin_unlock(&vm->status_lock); + return 0; +} - bo = bo_va->base.bo; - ret = amdgpu_userq_validate_vm_bo(NULL, bo); - if (ret) { - drm_file_err(uq_mgr->file, "Failed to validate BO\n"); - goto unlock_all; - } +static int +amdgpu_userq_update_vm(struct amdgpu_userq_mgr *uq_mgr) +{ + struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); + struct amdgpu_device *adev = uq_mgr->adev; + struct amdgpu_vm *vm = &fpriv->vm; + struct drm_exec exec; + int ret; - /* Try to reserve the BO to avoid clearing its ptes */ - if (!adev->debug_vm && dma_resv_trylock(resv)) { - clear = false; - unlock = true; - /* The caller is already holding the reservation lock */ - } else if (dma_resv_locking_ctx(resv) == ticket) { - clear = false; - unlock = false; - /* Somebody else is using the BO right now */ - } else { - clear = true; - unlock = false; - } + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0); + drm_exec_until_all_locked(&exec) { + ret = amdgpu_vm_lock_pd(vm, &exec, 1); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) + goto unlock_all; - ret = amdgpu_vm_bo_update(adev, bo_va, clear); + ret = amdgpu_vm_lock_done(vm, &exec, 1); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) + goto unlock_all; - if (unlock) - dma_resv_unlock(resv); - if (ret) + ret = amdgpu_vm_validate(adev, vm, NULL, + amdgpu_userq_validate_vm, + NULL); + if (unlikely(ret)) goto unlock_all; - spin_lock(&vm->status_lock); + ret = amdgpu_userq_validate_bos(adev, &exec, vm); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) + goto unlock_all; } - spin_unlock(&vm->status_lock); + + ret = amdgpu_vm_handle_moved(adev, vm, NULL); + if (ret) + goto unlock_all; + + ret = amdgpu_vm_update_pdes(adev, vm, false); + if (ret) + goto unlock_all; ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec); if (ret) @@ -667,7 +651,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work) mutex_lock(&uq_mgr->userq_mutex); - ret = amdgpu_userq_validate_bos(uq_mgr); + ret = amdgpu_userq_update_vm(uq_mgr); if (ret) { drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n"); goto unlock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index bf42246a3db2..16451c9bbe1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -484,6 +484,41 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec, 2 + num_fences); } +/** + * amdgpu_vm_lock_done - lock all BOs on the done list + * @exec: drm execution context + * @num_fences: number of extra fences to reserve + * + * Lock the BOs on the done list in the DRM execution context. + */ +int amdgpu_vm_lock_done(struct amdgpu_vm *vm, struct drm_exec *exec, + unsigned int num_fences) +{ + struct list_head *prev = &vm->done; + struct amdgpu_bo_va *bo_va; + struct amdgpu_bo *bo; + int ret; + + /* We can only trust prev->next while holding the lock */ + spin_lock(&vm->status_lock); + while (!list_is_head(prev->next, &vm->done)) { + bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status); + spin_unlock(&vm->status_lock); + + bo = bo_va->base.bo; + if (bo) { + ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 1); + if (unlikely(ret)) + return ret; + } + spin_lock(&vm->status_lock); + prev = prev->next; + } + spin_unlock(&vm->status_lock); + + return 0; +} + /** * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index f9549f6b3d1f..0e3884dfdb6d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -492,6 +492,8 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec, unsigned int num_fences); +int amdgpu_vm_lock_done(struct amdgpu_vm *vm, struct drm_exec *exec, + unsigned int num_fences); bool amdgpu_vm_ready(struct amdgpu_vm *vm); uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm); int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, -- 2.43.0