Public > -----Original Message----- > From: Christian König <[email protected]> > Sent: Tuesday, May 5, 2026 9:40 AM > To: [email protected]; Deucher, Alexander > <[email protected]> > Subject: [PATCH] drm/amdgpu: deprecate guilty handling > > The guilty handling tried to establish a second way of signaling problems with > the GPU back to userspace. This caused quite a bunch of issue we had to work > around, especially lifetime issues with the drm_sched_entity. > > Just drop the handling altogether and use the dma_fence based approach > instead. > > Signed-off-by: Christian König <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 5 ----- > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 25 > ++++++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 - > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +------- > 4 files changed, 24 insertions(+), 16 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index 10d8dcc3a972..d67c87fbf371 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -60,11 +60,6 @@ static int amdgpu_cs_parser_init(struct > amdgpu_cs_parser *p, > if (!p->ctx) > return -EINVAL; > > - if (atomic_read(&p->ctx->guilty)) { > - amdgpu_ctx_put(p->ctx); > - return -ECANCELED; > - } > - > amdgpu_sync_create(&p->sync); > drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT | > DRM_EXEC_IGNORE_DUPLICATES, 0); diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > index c273557fb1ae..cfb24e5d065e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > @@ -256,7 +256,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx > *ctx, u32 hw_ip, > } > > r = drm_sched_entity_init(&entity->entity, drm_prio, scheds, > num_scheds, > - &ctx->guilty); > + NULL); > if (r) > goto error_free_entity; > > @@ -580,6 +580,27 @@ static int 
amdgpu_ctx_query(struct amdgpu_device > *adev, > > #define AMDGPU_RAS_COUNTE_DELAY_MS 3000 > > +static bool amdgpu_ctx_guilty(struct amdgpu_ctx *ctx) { > + int i, j, r; > + > + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) { > + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) { > + struct amdgpu_ctx_entity *ctx_entity; > + > + ctx_entity = ctx->entities[i][j]; > + if (ctx_entity) > + continue; > + > + r == drm_sched_entity_error(&ctx_entity->entity);
There is an extra '=' in the statement just above: `r == drm_sched_entity_error(&ctx_entity->entity);` is a comparison whose result is discarded, so it should be the assignment `r = drm_sched_entity_error(&ctx_entity->entity);`. Other than that: Reviewed-by: Alex Deucher <[email protected]> > + if (r == -ETIME) > + return true; > + } > + } > + > + return false; > +} > + > static int amdgpu_ctx_query2(struct amdgpu_device *adev, > struct amdgpu_fpriv *fpriv, uint32_t id, > union drm_amdgpu_ctx_out *out) > @@ -608,7 +629,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device > *adev, > if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm)) > out->state.flags |= > AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST; > > - if (atomic_read(&ctx->guilty)) > + if (amdgpu_ctx_guilty(ctx)) > out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; > > if (amdgpu_in_reset(adev)) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h > index cf8d700a22fe..e444b2088d40 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h > @@ -50,7 +50,6 @@ struct amdgpu_ctx { > int32_t init_priority; > int32_t override_priority; > uint32_t stable_pstate; > - atomic_t guilty; > bool preamble_presented; > uint64_t generation; > unsigned long ras_counter_ce; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 073f632f295a..1536d40bb362 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5092,14 +5092,10 @@ int amdgpu_device_link_reset(struct > amdgpu_device *adev) int amdgpu_device_pre_asic_reset(struct > amdgpu_device *adev, > struct amdgpu_reset_context > *reset_context) { > - int i, r = 0; > - struct amdgpu_job *job = NULL; > struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; > bool need_full_reset = > test_bit(AMDGPU_NEED_FULL_RESET, &reset_context- > >flags); > - > - if (reset_context->reset_req_dev == adev) > - job = reset_context->job; > + int i, r; > > if (amdgpu_sriov_vf(adev)) > amdgpu_virt_pre_reset(adev); > @@ -5119,9 +5115,6 @@ int amdgpu_device_pre_asic_reset(struct > amdgpu_device 
*adev, > > amdgpu_fence_driver_isr_toggle(adev, false); > > - if (job && job->vm) > - drm_sched_increase_karma(&job->base); > - > r = amdgpu_reset_prepare_hwcontext(adev, reset_context); > /* If reset handler not implemented, continue; otherwise return */ > if (r == -EOPNOTSUPP) > -- > 2.43.0
