[Public]
Regards,
Prike
> -----Original Message-----
> From: Alex Deucher <[email protected]>
> Sent: Tuesday, February 3, 2026 5:45 AM
> To: Christian König <[email protected]>
> Cc: [email protected]; Deucher, Alexander <[email protected]>;
> Liang, Prike <[email protected]>; Mohan Marimuthu, Yogesh
> <[email protected]>; SHANMUGAM, SRINIVASAN
> <[email protected]>; Khatri, Sunil <[email protected]>;
> [email protected]
> Subject: Re: [PATCH 1/9] drm/amdgpu: lock both VM and BO in
> amdgpu_gem_object_open
>
> On Mon, Feb 2, 2026 at 7:51 AM Christian König
> <[email protected]> wrote:
> >
> > The VM was not locked in the past since we initially only cleared the
> > linked list element and not added it to any VM state.
> >
> > But this has changed quite some time ago, we just never realized this
> > problem because the VM state lock was masking it.
> >
> > Signed-off-by: Christian König <[email protected]>
>
> Reviewed-by: Alex Deucher <[email protected]>
>
> > ---
> > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 19 +++++++++++-----
> > drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 22 ++++++++++++++-----
> > drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 +++++++++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 ++
> > 4 files changed, 42 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > index 768998c82b43..ec5130497743 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > @@ -878,6 +878,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev,
> struct kgd_mem *mem,
> > struct amdgpu_bo *bo[2] = {NULL, NULL};
> > struct amdgpu_bo_va *bo_va;
> > bool same_hive = false;
> > + struct drm_exec exec;
> > int i, ret;
> >
> > if (!va) {
> > @@ -958,19 +959,25 @@ static int kfd_mem_attach(struct amdgpu_device *adev,
> struct kgd_mem *mem,
> > goto unwind;
> > }
> >
> > - /* Add BO to VM internal data structures */
> > - ret = amdgpu_bo_reserve(bo[i], false);
> > - if (ret) {
> > - pr_debug("Unable to reserve BO during memory
> > attach");
> > - goto unwind;
> > + drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
> > + drm_exec_until_all_locked(&exec) {
> > + ret = amdgpu_vm_lock_pd(vm, &exec, 0);
> > + drm_exec_retry_on_contention(&exec);
> > + if (unlikely(ret))
> > + goto unwind;
Here we should handle the error return path properly and destroy the exec
resource in the error handler.
> > + ret = drm_exec_lock_obj(&exec, &bo[i]->tbo.base);
> > + drm_exec_retry_on_contention(&exec);
> > + if (unlikely(ret))
> > + goto unwind;
> > }
> > +
> > bo_va = amdgpu_vm_bo_find(vm, bo[i]);
> > if (!bo_va)
> > bo_va = amdgpu_vm_bo_add(adev, vm, bo[i]);
> > else
> > ++bo_va->ref_count;
> > attachment[i]->bo_va = bo_va;
> > - amdgpu_bo_unreserve(bo[i]);
> > + drm_exec_fini(&exec);
> > if (unlikely(!attachment[i]->bo_va)) {
> > ret = -ENOMEM;
> > pr_err("Failed to add BO object to VM. ret ==
> > %d\n", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > index 5f9fa2140f09..5c90de58cc28 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > @@ -232,6 +232,7 @@ static int amdgpu_gem_object_open(struct
> drm_gem_object *obj,
> > struct amdgpu_vm *vm = &fpriv->vm;
> > struct amdgpu_bo_va *bo_va;
> > struct mm_struct *mm;
> > + struct drm_exec exec;
> > int r;
> >
> > mm = amdgpu_ttm_tt_get_usermm(abo->tbo.ttm);
> > @@ -242,9 +243,18 @@ static int amdgpu_gem_object_open(struct
> drm_gem_object *obj,
> > !amdgpu_vm_is_bo_always_valid(vm, abo))
> > return -EPERM;
> >
> > - r = amdgpu_bo_reserve(abo, false);
> > - if (r)
> > - return r;
> > + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
> > + drm_exec_until_all_locked(&exec) {
> > + r = drm_exec_prepare_obj(&exec, &abo->tbo.base, 1);
> > + drm_exec_retry_on_contention(&exec);
> > + if (unlikely(r))
> > + goto out_unlock;
> > +
> > + r = amdgpu_vm_lock_pd(vm, &exec, 0);
> > + drm_exec_retry_on_contention(&exec);
> > + if (unlikely(r))
> > + goto out_unlock;
> > + }
> >
> > amdgpu_vm_bo_update_shared(abo);
> > bo_va = amdgpu_vm_bo_find(vm, abo); @@ -260,8 +270,7 @@ static
> > int amdgpu_gem_object_open(struct drm_gem_object *obj,
> > amdgpu_bo_unreserve(abo);
> > return r;
> > }
> > -
> > - amdgpu_bo_unreserve(abo);
> > + drm_exec_fini(&exec);
> >
> > /* Validate and add eviction fence to DMABuf imports with dynamic
> > * attachment in compute VMs. Re-validation will be done by @@
> > -294,7 +303,10 @@ static int amdgpu_gem_object_open(struct drm_gem_object
> *obj,
> > }
> > }
> > mutex_unlock(&vm->process_info->lock);
> > + return r;
> >
> > +out_unlock:
> > + drm_exec_fini(&exec);
> > return r;
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > index 1878e0faa722..f69332eed051 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > @@ -1445,6 +1445,7 @@ int amdgpu_driver_open_kms(struct drm_device
> > *dev, struct drm_file *file_priv) {
> > struct amdgpu_device *adev = drm_to_adev(dev);
> > struct amdgpu_fpriv *fpriv;
> > + struct drm_exec exec;
> > int r, pasid;
> >
> > /* Ensure IB tests are run on ring */ @@ -1484,7 +1485,16 @@
> > int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file
> > *file_priv)
> > if (r)
> > goto error_pasid;
> >
> > + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
> > + drm_exec_until_all_locked(&exec) {
> > + r = amdgpu_vm_lock_pd(&fpriv->vm, &exec, 0);
> > + drm_exec_retry_on_contention(&exec);
> > + if (unlikely(r))
> > + goto error_vm;
> > + }
> > +
Same here
> > fpriv->prt_va = amdgpu_vm_bo_add(adev, &fpriv->vm, NULL);
> > + drm_exec_fini(&exec);
> > if (!fpriv->prt_va) {
> > r = -ENOMEM;
> > goto error_vm;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 6a2ea200d90c..b4bf1b7c214f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -1735,6 +1735,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct
> > amdgpu_device *adev, {
> > struct amdgpu_bo_va *bo_va;
> >
> > + amdgpu_vm_assert_locked(vm);
> > +
> > bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
> > if (bo_va == NULL) {
> > return NULL;
> > --
> > 2.43.0
> >