Public

> -----Original Message-----
> From: Christian König <[email protected]>
> Sent: Tuesday, May 5, 2026 9:40 AM
> To: [email protected]; Deucher, Alexander
> <[email protected]>
> Subject: [PATCH] drm/amdgpu: deprecate guilty handling
>
> The guilty handling tried to establish a second way of signaling problems with
> the GPU back to userspace. This caused quite a bunch of issues we had to work
> around, especially lifetime issues with the drm_sched_entity.
>
> Just drop the handling altogether and use the dma_fence based approach
> instead.
>
> Signed-off-by: Christian König <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  5 -----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c    | 25
> ++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h    |  1 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  9 +-------
>  4 files changed, 24 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 10d8dcc3a972..d67c87fbf371 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -60,11 +60,6 @@ static int amdgpu_cs_parser_init(struct
> amdgpu_cs_parser *p,
>       if (!p->ctx)
>               return -EINVAL;
>
> -     if (atomic_read(&p->ctx->guilty)) {
> -             amdgpu_ctx_put(p->ctx);
> -             return -ECANCELED;
> -     }
> -
>       amdgpu_sync_create(&p->sync);
>       drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
>                     DRM_EXEC_IGNORE_DUPLICATES, 0);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index c273557fb1ae..cfb24e5d065e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -256,7 +256,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx
> *ctx, u32 hw_ip,
>       }
>
>       r = drm_sched_entity_init(&entity->entity, drm_prio, scheds,
> num_scheds,
> -                               &ctx->guilty);
> +                               NULL);
>       if (r)
>               goto error_free_entity;
>
> @@ -580,6 +580,27 @@ static int amdgpu_ctx_query(struct amdgpu_device
> *adev,
>
>  #define AMDGPU_RAS_COUNTE_DELAY_MS 3000
>
> +static bool amdgpu_ctx_guilty(struct amdgpu_ctx *ctx) {
> +     int i, j, r;
> +
> +     for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> +             for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> +                     struct amdgpu_ctx_entity *ctx_entity;
> +
> +                     ctx_entity = ctx->entities[i][j];
> +                     if (ctx_entity)
> +                             continue;
> +
> +                     r == drm_sched_entity_error(&ctx_entity->entity);

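There is an extra '=' here: "r == drm_sched_entity_error(...)" is a comparison
whose result is discarded, so r is never assigned before the -ETIME check below.
It should be "r = drm_sched_entity_error(&ctx_entity->entity);".  Other than that: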
Reviewed-by: Alex Deucher <[email protected]>

> +                     if (r == -ETIME)
> +                             return true;
> +             }
> +     }
> +
> +     return false;
> +}
> +
>  static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>                            struct amdgpu_fpriv *fpriv, uint32_t id,
>                            union drm_amdgpu_ctx_out *out)
> @@ -608,7 +629,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device
> *adev,
>       if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm))
>               out->state.flags |=
> AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;
>
> -     if (atomic_read(&ctx->guilty))
> +     if (amdgpu_ctx_guilty(ctx))
>               out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
>
>       if (amdgpu_in_reset(adev))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index cf8d700a22fe..e444b2088d40 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -50,7 +50,6 @@ struct amdgpu_ctx {
>       int32_t                         init_priority;
>       int32_t                         override_priority;
>       uint32_t                        stable_pstate;
> -     atomic_t                        guilty;
>       bool                            preamble_presented;
>       uint64_t                        generation;
>       unsigned long                   ras_counter_ce;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 073f632f295a..1536d40bb362 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5092,14 +5092,10 @@ int amdgpu_device_link_reset(struct amdgpu_device *adev)
>  int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>                                struct amdgpu_reset_context *reset_context)
>  {
> -     int i, r = 0;
> -     struct amdgpu_job *job = NULL;
>       struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
>       bool need_full_reset =
>               test_bit(AMDGPU_NEED_FULL_RESET, &reset_context-
> >flags);
> -
> -     if (reset_context->reset_req_dev == adev)
> -             job = reset_context->job;
> +     int i, r;
>
>       if (amdgpu_sriov_vf(adev))
>               amdgpu_virt_pre_reset(adev);
> @@ -5119,9 +5115,6 @@ int amdgpu_device_pre_asic_reset(struct
> amdgpu_device *adev,
>
>       amdgpu_fence_driver_isr_toggle(adev, false);
>
> -     if (job && job->vm)
> -             drm_sched_increase_karma(&job->base);
> -
>       r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
>       /* If reset handler not implemented, continue; otherwise return */
>       if (r == -EOPNOTSUPP)
> --
> 2.43.0

Reply via email to