On 5/21/26 12:43, Mikhail Gavrilov wrote: > amdgpu_vm_lock_by_pasid() looks up a VM by PASID and reserves its root > PD with a bare amdgpu_bo_reserve(), returning the still-reserved root to > the caller. A caller that then needs to reserve further BOs (for example > the devcoredump IB dump) ends up nesting reservation_ww_class_mutex > acquires without a ww_acquire_ctx, which lockdep flags as recursive > locking. > > Convert the helper to take a drm_exec context and lock the root PD with > drm_exec_lock_obj(). Callers now run it inside a > drm_exec_until_all_locked() loop and can lock additional BOs in the same > ww ticket, so there is no nested ww_mutex acquire. > > The drm_exec context holds its own reference on the locked root BO, so > the helper no longer hands a root reference back to the caller: the > root output parameter is dropped, and the transient reference taken > across the PASID lookup is released before returning. > > The only existing caller, amdgpu_vm_handle_fault(), is updated > accordingly. Its is_compute_context path, which previously dropped the > root reservation around svm_range_restore_pages() and re-took it, now > finalises the drm_exec context and re-initialises a fresh one; behaviour > is otherwise unchanged. > > No functional change intended for the page-fault path. > > Signed-off-by: Mikhail Gavrilov <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 91 ++++++++++++++++---------- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- > 2 files changed, 58 insertions(+), 35 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index 9ba9de16a27a..591980907211 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -2950,47 +2950,56 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void > *data, struct drm_file *filp) > } > > /** > - * amdgpu_vm_lock_by_pasid - return an amdgpu_vm and its root bo from a > pasid, if possible. > + * amdgpu_vm_lock_by_pasid - look up a VM by PASID and lock its root PD > * @adev: amdgpu device pointer > - * @root: root BO of the VM > * @pasid: PASID of the VM > - * The caller needs to unreserve and unref the root bo on success. > + * @exec: drm_exec context to lock the root PD in > + * > + * Must be called from within a drm_exec_until_all_locked() loop; the caller > + * runs drm_exec_retry_on_contention() afterwards. The drm_exec context holds > + * a reference on the root BO until it is finalised. > + * > + * Return: the VM on success, or NULL if the PASID has no VM, the VM is being > + * torn down, or locking the root PD failed. > */ > struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, > - struct amdgpu_bo **root, u32 pasid) > + u32 pasid, struct drm_exec *exec) > { > unsigned long irqflags; > + struct amdgpu_bo *root; > struct amdgpu_vm *vm; > int r; > > xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); > vm = xa_load(&adev->vm_manager.pasids, pasid); > - *root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL; > + root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL; > xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags); > > - if (!*root) > + if (!root) > return NULL; > > - r = amdgpu_bo_reserve(*root, true); > - if (r) > - goto error_unref; > + r = drm_exec_lock_obj(exec, &root->tbo.base); > + if (r) { > + amdgpu_bo_unref(&root); > + return NULL; > + } > > /* Double check that the VM still exists */ > xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); > vm = xa_load(&adev->vm_manager.pasids, pasid); > - if (vm && vm->root.bo != *root) > + if (vm && vm->root.bo != root) > vm = NULL; > xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags); > - if (!vm) > - goto error_unlock; > + if (!vm) { > + drm_exec_unlock_obj(exec, &root->tbo.base); > + amdgpu_bo_unref(&root); > + return NULL; > + } > > - return vm; > -error_unlock: > - amdgpu_bo_unreserve(*root); > + /* The drm_exec context holds its own reference on the root BO. */ > + amdgpu_bo_unref(&root); > > -error_unref: > - amdgpu_bo_unref(root); > - return NULL; > + return vm; > } > > /** > @@ -3012,33 +3021,49 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device > *adev, u32 pasid, > uint64_t ts, bool write_fault) > { > bool is_compute_context = false; > - struct amdgpu_bo *root; > + struct drm_exec exec; > uint64_t value, flags; > struct amdgpu_vm *vm; > int r; > > - vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); > - if (!vm) > + drm_exec_init(&exec, 0, 0);
Make the last parameter 1 here since we are expecting to lock 1 object. Not a must have, it will work without but it is just a little bit more optimal. Apart from that Reviewed-by: Christian König <[email protected]>. Thanks, Christian. > + drm_exec_until_all_locked(&exec) { > + vm = amdgpu_vm_lock_by_pasid(adev, pasid, &exec); > + drm_exec_retry_on_contention(&exec); > + if (!vm) > + break; > + } > + if (!vm) { > + drm_exec_fini(&exec); > return false; > + } > > is_compute_context = vm->is_compute_context; > > if (is_compute_context) { > - /* Unreserve root since svm_range_restore_pages might try to > reserve it. */ > - /* TODO: rework svm_range_restore_pages so that this isn't > necessary. */ > - amdgpu_bo_unreserve(root); > + /* Release the root PD lock since svm_range_restore_pages > + * might try to take it. > + * TODO: rework svm_range_restore_pages so that this isn't > + * necessary. > + */ > + drm_exec_fini(&exec); > > if (!svm_range_restore_pages(adev, pasid, vmid, > - node_id, addr >> PAGE_SHIFT, ts, > write_fault)) { > - amdgpu_bo_unref(&root); > + node_id, addr >> PAGE_SHIFT, ts, > write_fault)) > return true; > - } > - amdgpu_bo_unref(&root); > > /* Re-acquire the VM lock, could be that the VM was freed in > between. */ > - vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); > - if (!vm) > + drm_exec_init(&exec, 0, 0); > + drm_exec_until_all_locked(&exec) { > + vm = amdgpu_vm_lock_by_pasid(adev, pasid, &exec); > + drm_exec_retry_on_contention(&exec); > + if (!vm) > + break; > + } > + if (!vm) { > + drm_exec_fini(&exec); > return false; > + } > } > > addr /= AMDGPU_GPU_PAGE_SIZE; > @@ -3062,7 +3087,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, > u32 pasid, > value = 0; > } > > - r = dma_resv_reserve_fences(root->tbo.base.resv, 1); > + r = dma_resv_reserve_fences(vm->root.bo->tbo.base.resv, 1); > if (r) { > pr_debug("failed %d to reserve fence slot\n", r); > goto error_unlock; > @@ -3076,12 +3101,10 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device > *adev, u32 pasid, > r = amdgpu_vm_update_pdes(adev, vm, true); > > error_unlock: > - amdgpu_bo_unreserve(root); > + drm_exec_fini(&exec); > if (r < 0) > dev_err(adev->dev, "Can't handle page fault (%d)\n", r); > > - amdgpu_bo_unref(&root); > - > return false; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index d083d7aab75c..0c6e3e0368c7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -593,7 +593,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, > u32 pasid, > bool write_fault); > > struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, > - struct amdgpu_bo **root, u32 pasid); > + u32 pasid, struct drm_exec *exec); > > void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); >
