On 8/12/25 1:31 AM, Liu01 Tong wrote:
> During process kill, drm_sched_entity_flush() kills the VM entities.
> Subsequent job submissions from this process then fail, but the
> resources of those jobs are never released and their fences are never
> signalled, causing tasks to hang and time out.
> 
> Fix this by checking the entity status in amdgpu_vm_ready() and not
> submitting jobs to a stopped entity.
> 
> v2: add amdgpu_vm_ready() check before amdgpu_vm_clear_freed() in
> function amdgpu_cs_vm_handling().
> 
> Signed-off-by: Liu01 Tong <tong.li...@amd.com>
> Signed-off-by: Lin.Cao <linca...@amd.com>

Closes: https://lore.kernel.org/regressions/f2b70e6e-bff6-42f3-82a2-81eed892c...@linux.dev/
Tested-by: Matthew Schwartz <matthew.schwa...@linux.dev>

Thanks,
Matt

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  3 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 15 +++++++++++----
>  2 files changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index e1e48e6f1f35..cdc02860011c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1138,6 +1138,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>               }
>       }
>  
> +     if (!amdgpu_vm_ready(vm))
> +             return -EINVAL;
> +
>       r = amdgpu_vm_clear_freed(adev, vm, NULL);
>       if (r)
>               return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 283dd44f04b0..bf42246a3db2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -654,11 +654,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   * Check if all VM PDs/PTs are ready for updates
>   *
>   * Returns:
> - * True if VM is not evicting.
> + * True if VM is not evicting and all VM entities are not stopped
>   */
>  bool amdgpu_vm_ready(struct amdgpu_vm *vm)
>  {
> -     bool empty;
>       bool ret;
>  
>       amdgpu_vm_eviction_lock(vm);
> @@ -666,10 +665,18 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm)
>       amdgpu_vm_eviction_unlock(vm);
>  
>       spin_lock(&vm->status_lock);
> -     empty = list_empty(&vm->evicted);
> +     ret &= list_empty(&vm->evicted);
>       spin_unlock(&vm->status_lock);
>  
> -     return ret && empty;
> +     spin_lock(&vm->immediate.lock);
> +     ret &= !vm->immediate.stopped;
> +     spin_unlock(&vm->immediate.lock);
> +
> +     spin_lock(&vm->delayed.lock);
> +     ret &= !vm->delayed.stopped;
> +     spin_unlock(&vm->delayed.lock);
> +
> +     return ret;
>  }
>  
>  /**

Reply via email to