On 2/8/26 07:27, Srinivasan Shanmugam wrote: > amdgpu_vm_flush() only returns an error when amdgpu_fence_emit() fails.
You need to take a step further amdgpu_fence_emit() can't fail either. The fallback wait inside that function blocks forever until the fence signals and that should never happen in the job submission path in the first place. Regards, Christian. > That failure is not expected and the caller should not have to handle > it. > > Handle amdgpu_fence_emit() failure inside amdgpu_vm_flush() by undoing > the ring and return 0. Drop the return value handling in > amdgpu_ib_schedule(). > > Cc: Alex Deucher <[email protected]> > Suggested-by: Christian König <[email protected]> > Signed-off-by: Srinivasan Shanmugam <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 9 ++------- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 25 +++++++++++++++---------- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- > 3 files changed, 18 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c > index 44f230d67da2..e763b2c1a386 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c > @@ -225,13 +225,8 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, > unsigned int num_ibs, > if (ring->funcs->insert_start) > ring->funcs->insert_start(ring); > > - if (job) { > - r = amdgpu_vm_flush(ring, job, need_pipe_sync); > - if (r) { > - amdgpu_ring_undo(ring); > - return r; > - } > - } > + if (job) > + amdgpu_vm_flush(ring, job, need_pipe_sync); > > amdgpu_ring_ib_begin(ring); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index 807f8bcc7de5..2698a3bf7970 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -764,12 +764,9 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring > *ring, > * @need_pipe_sync: is pipe sync needed > * > * Emit a VM flush when it is necessary. > - * > - * Returns: > - * 0 on success, errno otherwise. > */ > -int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, > - bool need_pipe_sync) > +void amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, > + bool need_pipe_sync) > { > struct amdgpu_device *adev = ring->adev; > struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; > @@ -783,7 +780,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct > amdgpu_job *job, > bool cleaner_shader_needed = false; > bool pasid_mapping_needed = false; > struct dma_fence *fence = NULL; > - unsigned int patch; > + unsigned int patch = 0; > int r; > > if (amdgpu_vmid_had_gpu_reset(adev, id)) { > @@ -812,7 +809,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct > amdgpu_job *job, > > if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync && > !cleaner_shader_needed) > - return 0; > + return; > > amdgpu_ring_ib_begin(ring); > if (ring->funcs->init_cond_exec) > @@ -846,8 +843,17 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct > amdgpu_job *job, > > if (vm_flush_needed || pasid_mapping_needed || cleaner_shader_needed) { > r = amdgpu_fence_emit(ring, job->hw_vm_fence, 0); > - if (r) > - return r; > + if (WARN_ON_ONCE(r)) { > + /* > + * amdgpu_vm_flush() is not expected to fail. > + * The only possible error originates from > amdgpu_fence_emit(). > + * If it ever fails, undo ring writes and don't > propagate > + * an error to callers. > + */ > + dev_err(adev->dev, "failed to emit VM fence (%d)\n", r); > + amdgpu_ring_undo(ring); > + return; > + } > fence = &job->hw_vm_fence->base; > /* get a ref for the job */ > dma_fence_get(fence); > @@ -892,7 +898,6 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct > amdgpu_job *job, > } > > amdgpu_ring_ib_end(ring); > - return 0; > } > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index 139642eacdd0..322193b97d3c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -512,7 +512,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct > amdgpu_vm *vm, > struct ww_acquire_ctx *ticket, > int (*callback)(void *p, struct amdgpu_bo *bo), > void *param); > -int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool > need_pipe_sync); > +void amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool > need_pipe_sync); > int amdgpu_vm_update_pdes(struct amdgpu_device *adev, > struct amdgpu_vm *vm, bool immediate); > int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
