[Public]

Regards,
      Prike

> -----Original Message-----
> From: Alex Deucher <[email protected]>
> Sent: Tuesday, February 3, 2026 5:45 AM
> To: Christian König <[email protected]>
> Cc: [email protected]; Deucher, Alexander <[email protected]>;
> Liang, Prike <[email protected]>; Mohan Marimuthu, Yogesh
> <[email protected]>; SHANMUGAM, SRINIVASAN
> <[email protected]>; Khatri, Sunil <[email protected]>;
> [email protected]
> Subject: Re: [PATCH 1/9] drm/amdgpu: lock both VM and BO in
> amdgpu_gem_object_open
>
> On Mon, Feb 2, 2026 at 7:51 AM Christian König
> <[email protected]> wrote:
> >
> > The VM was not locked in the past since we initially only cleared the
> > linked list element and not added it to any VM state.
> >
> > But this has changed quite some time ago, we just never realized this
> > problem because the VM state lock was masking it.
> >
> > Signed-off-by: Christian König <[email protected]>
>
> Reviewed-by: Alex Deucher <[email protected]>
>
> > ---
> >  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 19 +++++++++++-----
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       | 22 ++++++++++++++-----
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       | 10 +++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  2 ++
> >  4 files changed, 42 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > index 768998c82b43..ec5130497743 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > @@ -878,6 +878,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev,
> struct kgd_mem *mem,
> >         struct amdgpu_bo *bo[2] = {NULL, NULL};
> >         struct amdgpu_bo_va *bo_va;
> >         bool same_hive = false;
> > +       struct drm_exec exec;
> >         int i, ret;
> >
> >         if (!va) {
> > @@ -958,19 +959,25 @@ static int kfd_mem_attach(struct amdgpu_device *adev,
> struct kgd_mem *mem,
> >                         goto unwind;
> >                 }
> >
> > -               /* Add BO to VM internal data structures */
> > -               ret = amdgpu_bo_reserve(bo[i], false);
> > -               if (ret) {
> > -                       pr_debug("Unable to reserve BO during memory 
> > attach");
> > -                       goto unwind;
> > +               drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
> > +               drm_exec_until_all_locked(&exec) {
> > +                       ret = amdgpu_vm_lock_pd(vm, &exec, 0);
> > +                       drm_exec_retry_on_contention(&exec);
> > +                       if (unlikely(ret))
> > +                               goto unwind;
Here we should handle the error return path properly and destroy the exec 
resource in the error handler.

> > +                       ret = drm_exec_lock_obj(&exec, &bo[i]->tbo.base);
> > +                       drm_exec_retry_on_contention(&exec);
> > +                       if (unlikely(ret))
> > +                               goto unwind;
> >                 }
> > +
> >                 bo_va = amdgpu_vm_bo_find(vm, bo[i]);
> >                 if (!bo_va)
> >                         bo_va = amdgpu_vm_bo_add(adev, vm, bo[i]);
> >                 else
> >                         ++bo_va->ref_count;
> >                 attachment[i]->bo_va = bo_va;
> > -               amdgpu_bo_unreserve(bo[i]);
> > +               drm_exec_fini(&exec);
> >                 if (unlikely(!attachment[i]->bo_va)) {
> >                         ret = -ENOMEM;
> >                         pr_err("Failed to add BO object to VM. ret ==
> > %d\n", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > index 5f9fa2140f09..5c90de58cc28 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > @@ -232,6 +232,7 @@ static int amdgpu_gem_object_open(struct
> drm_gem_object *obj,
> >         struct amdgpu_vm *vm = &fpriv->vm;
> >         struct amdgpu_bo_va *bo_va;
> >         struct mm_struct *mm;
> > +       struct drm_exec exec;
> >         int r;
> >
> >         mm = amdgpu_ttm_tt_get_usermm(abo->tbo.ttm);
> > @@ -242,9 +243,18 @@ static int amdgpu_gem_object_open(struct
> drm_gem_object *obj,
> >             !amdgpu_vm_is_bo_always_valid(vm, abo))
> >                 return -EPERM;
> >
> > -       r = amdgpu_bo_reserve(abo, false);
> > -       if (r)
> > -               return r;
> > +       drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
> > +       drm_exec_until_all_locked(&exec) {
> > +               r = drm_exec_prepare_obj(&exec, &abo->tbo.base, 1);
> > +               drm_exec_retry_on_contention(&exec);
> > +               if (unlikely(r))
> > +                       goto out_unlock;
> > +
> > +               r = amdgpu_vm_lock_pd(vm, &exec, 0);
> > +               drm_exec_retry_on_contention(&exec);
> > +               if (unlikely(r))
> > +                       goto out_unlock;
> > +       }
> >
> >         amdgpu_vm_bo_update_shared(abo);
> >         bo_va = amdgpu_vm_bo_find(vm, abo); @@ -260,8 +270,7 @@ static
> > int amdgpu_gem_object_open(struct drm_gem_object *obj,
> >                 amdgpu_bo_unreserve(abo);
> >                 return r;
> >         }
> > -
> > -       amdgpu_bo_unreserve(abo);
> > +       drm_exec_fini(&exec);
> >
> >         /* Validate and add eviction fence to DMABuf imports with dynamic
> >          * attachment in compute VMs. Re-validation will be done by @@
> > -294,7 +303,10 @@ static int amdgpu_gem_object_open(struct drm_gem_object
> *obj,
> >                 }
> >         }
> >         mutex_unlock(&vm->process_info->lock);
> > +       return r;
> >
> > +out_unlock:
> > +       drm_exec_fini(&exec);
> >         return r;
> >  }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > index 1878e0faa722..f69332eed051 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > @@ -1445,6 +1445,7 @@ int amdgpu_driver_open_kms(struct drm_device
> > *dev, struct drm_file *file_priv)  {
> >         struct amdgpu_device *adev = drm_to_adev(dev);
> >         struct amdgpu_fpriv *fpriv;
> > +       struct drm_exec exec;
> >         int r, pasid;
> >
> >         /* Ensure IB tests are run on ring */ @@ -1484,7 +1485,16 @@
> > int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file 
> > *file_priv)
> >         if (r)
> >                 goto error_pasid;
> >
> > +       drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
> > +       drm_exec_until_all_locked(&exec) {
> > +               r = amdgpu_vm_lock_pd(&fpriv->vm, &exec, 0);
> > +               drm_exec_retry_on_contention(&exec);
> > +               if (unlikely(r))
> > +                       goto error_vm;
> > +       }
> > +
Same here
> >         fpriv->prt_va = amdgpu_vm_bo_add(adev, &fpriv->vm, NULL);
> > +       drm_exec_fini(&exec);
> >         if (!fpriv->prt_va) {
> >                 r = -ENOMEM;
> >                 goto error_vm;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 6a2ea200d90c..b4bf1b7c214f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -1735,6 +1735,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct
> > amdgpu_device *adev,  {
> >         struct amdgpu_bo_va *bo_va;
> >
> > +       amdgpu_vm_assert_locked(vm);
> > +
> >         bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
> >         if (bo_va == NULL) {
> >                 return NULL;
> > --
> > 2.43.0
> >

Reply via email to