Re: [PATCH] drm/amdgpu: Don't propagate errors from amdgpu_vm_flush()

Christian König Mon, 09 Feb 2026 01:41:37 -0800

On 2/8/26 07:27, Srinivasan Shanmugam wrote:
> amdgpu_vm_flush() only returns an error when amdgpu_fence_emit() fails.


You need to take a step further amdgpu_fence_emit() can't fail either.

The fallback wait inside that function blocks forever until the fence signals 
and that should never happen in the job submission path in the first place.

Regards,
Christian.

> That failure is not expected and the caller should not have to handle
> it.
> 
> Handle amdgpu_fence_emit() failure inside amdgpu_vm_flush() by undoing
> the ring and return 0. Drop the return value handling in
> amdgpu_ib_schedule().
> 
> Cc: Alex Deucher <[email protected]>
> Suggested-by: Christian König <[email protected]>
> Signed-off-by: Srinivasan Shanmugam <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c |  9 ++-------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 25 +++++++++++++++----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  2 +-
>  3 files changed, 18 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 44f230d67da2..e763b2c1a386 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -225,13 +225,8 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, 
> unsigned int num_ibs,
>       if (ring->funcs->insert_start)
>               ring->funcs->insert_start(ring);
>  
> -     if (job) {
> -             r = amdgpu_vm_flush(ring, job, need_pipe_sync);
> -             if (r) {
> -                     amdgpu_ring_undo(ring);
> -                     return r;
> -             }
> -     }
> +     if (job)
> +             amdgpu_vm_flush(ring, job, need_pipe_sync);
>  
>       amdgpu_ring_ib_begin(ring);
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 807f8bcc7de5..2698a3bf7970 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -764,12 +764,9 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring 
> *ring,
>   * @need_pipe_sync: is pipe sync needed
>   *
>   * Emit a VM flush when it is necessary.
> - *
> - * Returns:
> - * 0 on success, errno otherwise.
>   */
> -int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> -                 bool need_pipe_sync)
> +void amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> +                  bool need_pipe_sync)
>  {
>       struct amdgpu_device *adev = ring->adev;
>       struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
> @@ -783,7 +780,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct 
> amdgpu_job *job,
>       bool cleaner_shader_needed = false;
>       bool pasid_mapping_needed = false;
>       struct dma_fence *fence = NULL;
> -     unsigned int patch;
> +     unsigned int patch = 0;
>       int r;
>  
>       if (amdgpu_vmid_had_gpu_reset(adev, id)) {
> @@ -812,7 +809,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct 
> amdgpu_job *job,
>  
>       if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync &&
>           !cleaner_shader_needed)
> -             return 0;
> +             return;
>  
>       amdgpu_ring_ib_begin(ring);
>       if (ring->funcs->init_cond_exec)
> @@ -846,8 +843,17 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct 
> amdgpu_job *job,
>  
>       if (vm_flush_needed || pasid_mapping_needed || cleaner_shader_needed) {
>               r = amdgpu_fence_emit(ring, job->hw_vm_fence, 0);
> -             if (r)
> -                     return r;
> +             if (WARN_ON_ONCE(r)) {
> +                     /*
> +                      * amdgpu_vm_flush() is not expected to fail.
> +                      * The only possible error originates from 
> amdgpu_fence_emit().
> +                      * If it ever fails, undo ring writes and don't 
> propagate
> +                      * an error to callers.
> +                      */
> +                     dev_err(adev->dev, "failed to emit VM fence (%d)\n", r);
> +                     amdgpu_ring_undo(ring);
> +                     return;
> +             }
>               fence = &job->hw_vm_fence->base;
>               /* get a ref for the job */
>               dma_fence_get(fence);
> @@ -892,7 +898,6 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct 
> amdgpu_job *job,
>       }
>  
>       amdgpu_ring_ib_end(ring);
> -     return 0;
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 139642eacdd0..322193b97d3c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -512,7 +512,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>                      struct ww_acquire_ctx *ticket,
>                      int (*callback)(void *p, struct amdgpu_bo *bo),
>                      void *param);
> -int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool 
> need_pipe_sync);
> +void amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool 
> need_pipe_sync);
>  int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
>                         struct amdgpu_vm *vm, bool immediate);
>  int amdgpu_vm_clear_freed(struct amdgpu_device *adev,

Re: [PATCH] drm/amdgpu: Don't propagate errors from amdgpu_vm_flush()

Reply via email to