On Wed, Nov 19, 2025 at 02:41:04PM -0800, Matthew Brost wrote:
> Deregistering queues in the TDR introduces unnecessary complexity,
> requiring reference-counting techniques to function correctly,
> particularly to prevent use-after-free (UAF) issues while a
> deregistration initiated from the TDR is in progress.
>
> All that's needed in the TDR is to kick the queue off the hardware,
> which is achieved by disabling scheduling. Queue deregistration should
> be handled at a single, well-defined point in the cleanup path, tied to
> the queue's reference count.
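
Just to restate the lifetime rule this patch establishes, as I read
it, here is a stand-in sketch (hypothetical types and names, not the
actual xe helpers): the TDR only disables scheduling, and the GuC
deregistration happens exactly once, when the last queue reference
drops.

#include <stdbool.h>
#include <stdio.h>

struct queue {
        int refcount;           /* stands in for xe_exec_queue's kref */
        bool registered;
};

static void tdr(struct queue *q)
{
        /* TDR path: only kick the queue off the hardware by disabling
         * scheduling; never deregister from here.
         */
        (void)q;
        printf("scheduling disabled\n");
}

static void queue_put(struct queue *q)
{
        /* Single, well-defined deregistration point, tied to the
         * queue's reference count.
         */
        if (--q->refcount == 0 && q->registered) {
                q->registered = false;
                printf("context deregistered\n");
        }
}

int main(void)
{
        struct queue q = { .refcount = 2, .registered = true };

        tdr(&q);        /* timeout: queue kicked off the hardware */
        queue_put(&q);  /* another reference still held */
        queue_put(&q);  /* last reference: deregister here, once */
        return 0;
}
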
>
> v4:
> - Explain why an extra ref was needed prior to this patch (Niranjana)
>
> Signed-off-by: Matthew Brost <[email protected]>
> ---
> drivers/gpu/drm/xe/xe_guc_submit.c | 65 +++++-------------------------
> 1 file changed, 9 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 648c9ea06749..5de300b66767 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -69,9 +69,8 @@ exec_queue_to_guc(struct xe_exec_queue *q)
> #define EXEC_QUEUE_STATE_WEDGED (1 << 8)
> #define EXEC_QUEUE_STATE_BANNED (1 << 9)
> #define EXEC_QUEUE_STATE_CHECK_TIMEOUT (1 << 10)
> -#define EXEC_QUEUE_STATE_EXTRA_REF (1 << 11)
> -#define EXEC_QUEUE_STATE_PENDING_RESUME (1 << 12)
> -#define EXEC_QUEUE_STATE_PENDING_TDR_EXIT (1 << 13)
> +#define EXEC_QUEUE_STATE_PENDING_RESUME (1 << 11)
> +#define EXEC_QUEUE_STATE_PENDING_TDR_EXIT (1 << 12)
>
> static bool exec_queue_registered(struct xe_exec_queue *q)
> {
> @@ -218,21 +217,6 @@ static void clear_exec_queue_check_timeout(struct xe_exec_queue *q)
> atomic_and(~EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
> }
>
> -static bool exec_queue_extra_ref(struct xe_exec_queue *q)
> -{
> - return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_EXTRA_REF;
> -}
> -
> -static void set_exec_queue_extra_ref(struct xe_exec_queue *q)
> -{
> - atomic_or(EXEC_QUEUE_STATE_EXTRA_REF, &q->guc->state);
> -}
> -
> -static void clear_exec_queue_extra_ref(struct xe_exec_queue *q)
> -{
> - atomic_and(~EXEC_QUEUE_STATE_EXTRA_REF, &q->guc->state);
> -}
> -
> static bool exec_queue_pending_resume(struct xe_exec_queue *q)
> {
> return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_PENDING_RESUME;
> @@ -1190,25 +1174,6 @@ static void disable_scheduling(struct xe_exec_queue *q, bool immediate)
> G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
> }
>
> -static void __deregister_exec_queue(struct xe_guc *guc, struct xe_exec_queue *q)
> -{
> - u32 action[] = {
> - XE_GUC_ACTION_DEREGISTER_CONTEXT,
> - q->guc->id,
> - };
> -
> - xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
> - xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
> - xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_enable(q));
> - xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_disable(q));
> -
> - set_exec_queue_destroyed(q);
> - trace_xe_exec_queue_deregister(q);
> -
> - xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
> - G2H_LEN_DW_DEREGISTER_CONTEXT, 1);
> -}
> -
> static enum drm_gpu_sched_stat
> guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> {
> @@ -1225,6 +1190,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> bool wedged = false, skip_timeout_check;
>
> xe_gt_assert(guc_to_gt(guc), !xe_exec_queue_is_lr(q));
> + xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
Is that always guaranteed? What if we get here because the TDR is
triggered by some error notification from the GuC, and before we get
here the exec_queue gets destroyed in the CLEANUP message handler? I
am not sure we can be certain this is race-proof.
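
Roughly the interleaving I have in mind, modeled as a standalone
program (not xe code; thread A stands in for the G2H CLEANUP handler,
thread B for the TDR entry, and nothing orders A against B):

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

#define STATE_DESTROYED (1 << 0)        /* models the destroyed state bit */

static atomic_int state;

static void *cleanup_handler(void *arg) /* models the G2H CLEANUP path */
{
        (void)arg;
        atomic_fetch_or(&state, STATE_DESTROYED);
        return NULL;
}

static void *tdr(void *arg)             /* models guc_exec_queue_timedout_job() */
{
        /* Mirrors the new assert at TDR entry; if the CLEANUP handler
         * ran first, this trips.
         */
        (void)arg;
        assert(!(atomic_load(&state) & STATE_DESTROYED));
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, cleanup_handler, NULL);
        pthread_create(&b, NULL, tdr, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}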