On 2/6/26 11:21, Khatri, Sunil wrote:
> 
> On 02-02-2026 06:21 pm, Christian König wrote:
>> The VM was not locked in the past since we initially only cleared the
>> linked list element and not added it to any VM state.
>>
>> But this has changed quite some time ago, we just never realized this
>> problem because the VM state lock was masking it.
>>
>> Signed-off-by: Christian König <[email protected]>
>> ---
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 19 +++++++++++-----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       | 22 ++++++++++++++-----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       | 10 +++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  2 ++
>>   4 files changed, 42 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 768998c82b43..ec5130497743 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -878,6 +878,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, 
>> struct kgd_mem *mem,
>>       struct amdgpu_bo *bo[2] = {NULL, NULL};
>>       struct amdgpu_bo_va *bo_va;
>>       bool same_hive = false;
>> +    struct drm_exec exec;
>>       int i, ret;
>>         if (!va) {
>> @@ -958,19 +959,25 @@ static int kfd_mem_attach(struct amdgpu_device *adev, 
>> struct kgd_mem *mem,
>>               goto unwind;
>>           }
>>   -        /* Add BO to VM internal data structures */
>> -        ret = amdgpu_bo_reserve(bo[i], false);
>> -        if (ret) {
>> -            pr_debug("Unable to reserve BO during memory attach");
>> -            goto unwind;
>> +        drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
>> +        drm_exec_until_all_locked(&exec) {
>> +            ret = amdgpu_vm_lock_pd(vm, &exec, 0);
>> +            drm_exec_retry_on_contention(&exec);
>> +            if (unlikely(ret))
>> +                goto unwind;
>> +            ret = drm_exec_lock_obj(&exec, &bo[i]->tbo.base);
>> +            drm_exec_retry_on_contention(&exec);
>> +            if (unlikely(ret))
>> +                goto unwind;
>>           }
>> +
>>           bo_va = amdgpu_vm_bo_find(vm, bo[i]);
>>           if (!bo_va)
>>               bo_va = amdgpu_vm_bo_add(adev, vm, bo[i]);
>>           else
>>               ++bo_va->ref_count;
>>           attachment[i]->bo_va = bo_va;
>> -        amdgpu_bo_unreserve(bo[i]);
>> +        drm_exec_fini(&exec);
>>           if (unlikely(!attachment[i]->bo_va)) {
>>               ret = -ENOMEM;
>>               pr_err("Failed to add BO object to VM. ret == %d\n",
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index 5f9fa2140f09..5c90de58cc28 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -232,6 +232,7 @@ static int amdgpu_gem_object_open(struct drm_gem_object 
>> *obj,
>>       struct amdgpu_vm *vm = &fpriv->vm;
>>       struct amdgpu_bo_va *bo_va;
>>       struct mm_struct *mm;
>> +    struct drm_exec exec;
>>       int r;
>>         mm = amdgpu_ttm_tt_get_usermm(abo->tbo.ttm);
>> @@ -242,9 +243,18 @@ static int amdgpu_gem_object_open(struct drm_gem_object 
>> *obj,
>>           !amdgpu_vm_is_bo_always_valid(vm, abo))
>>           return -EPERM;
>>   -    r = amdgpu_bo_reserve(abo, false);
>> -    if (r)
>> -        return r;
>> +    drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
>> +    drm_exec_until_all_locked(&exec) {
>> +        r = drm_exec_prepare_obj(&exec, &abo->tbo.base, 1);
>> +        drm_exec_retry_on_contention(&exec);
>> +        if (unlikely(r))
>> +            goto out_unlock;
>> +
>> +        r = amdgpu_vm_lock_pd(vm, &exec, 0);
>> +        drm_exec_retry_on_contention(&exec);
>> +        if (unlikely(r))
>> +            goto out_unlock;
>> +    }
>>         amdgpu_vm_bo_update_shared(abo);
>>       bo_va = amdgpu_vm_bo_find(vm, abo);
>> @@ -260,8 +270,7 @@ static int amdgpu_gem_object_open(struct drm_gem_object 
>> *obj,
>>           amdgpu_bo_unreserve(abo);
>>           return r;
>>       }
>> -
>> -    amdgpu_bo_unreserve(abo);
>> +    drm_exec_fini(&exec);
>>         /* Validate and add eviction fence to DMABuf imports with dynamic
>>        * attachment in compute VMs. Re-validation will be done by
>> @@ -294,7 +303,10 @@ static int amdgpu_gem_object_open(struct drm_gem_object 
>> *obj,
>>           }
>>       }
>>       mutex_unlock(&vm->process_info->lock);
>> +    return r;
>>   +out_unlock:
>> +    drm_exec_fini(&exec);
>>       return r;
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index 1878e0faa722..f69332eed051 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -1445,6 +1445,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, 
>> struct drm_file *file_priv)
>>   {
>>       struct amdgpu_device *adev = drm_to_adev(dev);
>>       struct amdgpu_fpriv *fpriv;
>> +    struct drm_exec exec;
>>       int r, pasid;
>>         /* Ensure IB tests are run on ring */
>> @@ -1484,7 +1485,16 @@ int amdgpu_driver_open_kms(struct drm_device *dev, 
>> struct drm_file *file_priv)
>>       if (r)
>>           goto error_pasid;
>>   +    drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
>> +    drm_exec_until_all_locked(&exec) {
>> +        r = amdgpu_vm_lock_pd(&fpriv->vm, &exec, 0);
> Do we really need to do this here? We are in the process of creating a new VM 
> and new PD so at this time, no one would be able to use the VM and PD till it 
> returns from the function.
> 
> Looking at it with that understanding it feels we should be fine without 
> locking the pd here

The PD is already on TTM BO eviction list at that point and might be locked 
concurrently.

So yes, we absolutely need that.

Regards,
Christian.

> 
> Regards
> Sunil Khatri
> 
>> +        drm_exec_retry_on_contention(&exec);
>> +        if (unlikely(r))
>> +            goto error_vm;
>> +    }
>> +
>>       fpriv->prt_va = amdgpu_vm_bo_add(adev, &fpriv->vm, NULL);
>> +    drm_exec_fini(&exec);
>>       if (!fpriv->prt_va) {
>>           r = -ENOMEM;
>>           goto error_vm;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 6a2ea200d90c..b4bf1b7c214f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1735,6 +1735,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
>> amdgpu_device *adev,
>>   {
>>       struct amdgpu_bo_va *bo_va;
>>   +    amdgpu_vm_assert_locked(vm);
>> +
>>       bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
>>       if (bo_va == NULL) {
>>           return NULL;

Reply via email to