On 2/6/26 11:21, Khatri, Sunil wrote:
>
> On 02-02-2026 06:21 pm, Christian König wrote:
>> The VM was not locked in the past since we initially only cleared the
>> linked list element and not added it to any VM state.
>>
>> But this has changed quite some time ago, we just never realized this
>> problem because the VM state lock was masking it.
>>
>> Signed-off-by: Christian König <[email protected]>
>> ---
>> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 19 +++++++++++-----
>> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 22 ++++++++++++++-----
>> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 +++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 ++
>> 4 files changed, 42 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 768998c82b43..ec5130497743 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -878,6 +878,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev,
>> struct kgd_mem *mem,
>> struct amdgpu_bo *bo[2] = {NULL, NULL};
>> struct amdgpu_bo_va *bo_va;
>> bool same_hive = false;
>> + struct drm_exec exec;
>> int i, ret;
>> if (!va) {
>> @@ -958,19 +959,25 @@ static int kfd_mem_attach(struct amdgpu_device *adev,
>> struct kgd_mem *mem,
>> goto unwind;
>> }
>> - /* Add BO to VM internal data structures */
>> - ret = amdgpu_bo_reserve(bo[i], false);
>> - if (ret) {
>> - pr_debug("Unable to reserve BO during memory attach");
>> - goto unwind;
>> + drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
>> + drm_exec_until_all_locked(&exec) {
>> + ret = amdgpu_vm_lock_pd(vm, &exec, 0);
>> + drm_exec_retry_on_contention(&exec);
>> + if (unlikely(ret))
>> + goto unwind;
>> + ret = drm_exec_lock_obj(&exec, &bo[i]->tbo.base);
>> + drm_exec_retry_on_contention(&exec);
>> + if (unlikely(ret))
>> + goto unwind;
>> }
>> +
>> bo_va = amdgpu_vm_bo_find(vm, bo[i]);
>> if (!bo_va)
>> bo_va = amdgpu_vm_bo_add(adev, vm, bo[i]);
>> else
>> ++bo_va->ref_count;
>> attachment[i]->bo_va = bo_va;
>> - amdgpu_bo_unreserve(bo[i]);
>> + drm_exec_fini(&exec);
>> if (unlikely(!attachment[i]->bo_va)) {
>> ret = -ENOMEM;
>> pr_err("Failed to add BO object to VM. ret == %d\n",
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index 5f9fa2140f09..5c90de58cc28 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -232,6 +232,7 @@ static int amdgpu_gem_object_open(struct drm_gem_object
>> *obj,
>> struct amdgpu_vm *vm = &fpriv->vm;
>> struct amdgpu_bo_va *bo_va;
>> struct mm_struct *mm;
>> + struct drm_exec exec;
>> int r;
>> mm = amdgpu_ttm_tt_get_usermm(abo->tbo.ttm);
>> @@ -242,9 +243,18 @@ static int amdgpu_gem_object_open(struct drm_gem_object
>> *obj,
>> !amdgpu_vm_is_bo_always_valid(vm, abo))
>> return -EPERM;
>> - r = amdgpu_bo_reserve(abo, false);
>> - if (r)
>> - return r;
>> + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
>> + drm_exec_until_all_locked(&exec) {
>> + r = drm_exec_prepare_obj(&exec, &abo->tbo.base, 1);
>> + drm_exec_retry_on_contention(&exec);
>> + if (unlikely(r))
>> + goto out_unlock;
>> +
>> + r = amdgpu_vm_lock_pd(vm, &exec, 0);
>> + drm_exec_retry_on_contention(&exec);
>> + if (unlikely(r))
>> + goto out_unlock;
>> + }
>> amdgpu_vm_bo_update_shared(abo);
>> bo_va = amdgpu_vm_bo_find(vm, abo);
>> @@ -260,8 +270,7 @@ static int amdgpu_gem_object_open(struct drm_gem_object
>> *obj,
>> amdgpu_bo_unreserve(abo);
>> return r;
>> }
>> -
>> - amdgpu_bo_unreserve(abo);
>> + drm_exec_fini(&exec);
>> /* Validate and add eviction fence to DMABuf imports with dynamic
>> * attachment in compute VMs. Re-validation will be done by
>> @@ -294,7 +303,10 @@ static int amdgpu_gem_object_open(struct drm_gem_object
>> *obj,
>> }
>> }
>> mutex_unlock(&vm->process_info->lock);
>> + return r;
>> +out_unlock:
>> + drm_exec_fini(&exec);
>> return r;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index 1878e0faa722..f69332eed051 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -1445,6 +1445,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev,
>> struct drm_file *file_priv)
>> {
>> struct amdgpu_device *adev = drm_to_adev(dev);
>> struct amdgpu_fpriv *fpriv;
>> + struct drm_exec exec;
>> int r, pasid;
>> /* Ensure IB tests are run on ring */
>> @@ -1484,7 +1485,16 @@ int amdgpu_driver_open_kms(struct drm_device *dev,
>> struct drm_file *file_priv)
>> if (r)
>> goto error_pasid;
>> + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
>> + drm_exec_until_all_locked(&exec) {
>> + r = amdgpu_vm_lock_pd(&fpriv->vm, &exec, 0);
> Do we really need to do this here? We are in the process of creating a new VM
> and new PD so at this time, no one would be able to use the VM and PD till it
> returns from the function.
>
> Looking at it with that understanding it feels we should be fine without
> locking the pd here
The PD is already on TTM BO eviction list at that point and might be locked
concurrently.
So yes, we absolutely need that.
Regards,
Christian.
>
> Regards
> Sunil Khatri
>
>> + drm_exec_retry_on_contention(&exec);
>> + if (unlikely(r))
>> + goto error_vm;
>> + }
>> +
>> fpriv->prt_va = amdgpu_vm_bo_add(adev, &fpriv->vm, NULL);
>> + drm_exec_fini(&exec);
>> if (!fpriv->prt_va) {
>> r = -ENOMEM;
>> goto error_vm;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 6a2ea200d90c..b4bf1b7c214f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1735,6 +1735,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct
>> amdgpu_device *adev,
>> {
>> struct amdgpu_bo_va *bo_va;
>> + amdgpu_vm_assert_locked(vm);
>> +
>> bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
>> if (bo_va == NULL) {
>> return NULL;