-----Original Message-----
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Zhang, Jesse(Jie)
Sent: Thursday, August 28, 2025 4:16 PM
To: Alex Deucher <alexdeuc...@gmail.com>
Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander <alexander.deuc...@amd.com>; Koenig, Christian <christian.koe...@amd.com>
Subject: RE: [v12 11/11] drm/amdgpu: Implement user queue reset functionality

-----Original Message-----
From: Alex Deucher <alexdeuc...@gmail.com>
Sent: Tuesday, August 26, 2025 10:49 PM
To: Zhang, Jesse(Jie) <jesse.zh...@amd.com>
Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander <alexander.deuc...@amd.com>; Koenig, Christian <christian.koe...@amd.com>
Subject: Re: [v12 11/11] drm/amdgpu: Implement user queue reset functionality

On Mon, Aug 25, 2025 at 10:43 PM Jesse.Zhang <jesse.zh...@amd.com> wrote:
>
> From: Alex Deucher <alexander.deuc...@amd.com>
>
> This patch adds robust reset handling for user queues (userq) to
> improve recovery from queue failures. The key components include:
>
> 1. Queue detection and reset logic:
>    - amdgpu_userq_detect_and_reset_queues() identifies failed queues
>    - Per-IP detect_and_reset callbacks for targeted recovery
>    - Falls back to full GPU reset when needed
>
> 2. Reset infrastructure:
>    - Adds userq_reset_work workqueue for async reset handling
>    - Implements pre/post reset handlers for queue state management
>    - Integrates with existing GPU reset framework
>
> 3. Error handling improvements:
>    - Enhanced state tracking with HUNG state
>    - Automatic reset triggering on critical failures
>    - VRAM loss handling during recovery
>
> 4. Integration points:
>    - Added to device init/reset paths
>    - Called during queue destroy, suspend, and isolation events
>    - Handles both individual queue and full GPU resets
>
> The reset functionality works with both compute and graphics queues,
> providing better resilience against queue failures while minimizing
> disruption to unaffected queues.
>
> v2: Separate SDMA and GFX/COMPUTE queue reset handling
>
> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
> Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   8 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 215 +++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |   5 +
>  4 files changed, 216 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index ef3af170dda4..9db05cdc7304 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1302,6 +1302,7 @@ struct amdgpu_device {
>         struct list_head                userq_mgr_list;
>         struct mutex                    userq_mutex;
>         bool                            userq_halt_for_enforce_isolation;
> +       struct work_struct              userq_reset_work;
>  };
>
>  static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3757634613c3..1dc88b0055dd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4475,6 +4475,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         }
>
>         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +       INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>
>         adev->gfx.gfx_off_req_count = 1;
>         adev->gfx.gfx_off_residency = 0;
> @@ -5880,6 +5881,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>                                 if (r)
>                                         goto out;
>
> +                               r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> +                               if (r)
> +                                       goto out;
> +
>
>                                 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
>
>                                 /*
> @@ -6102,6 +6107,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>         if (!amdgpu_sriov_vf(adev))
>                 cancel_work(&adev->reset_work);
> #endif
> +       cancel_work(&adev->userq_reset_work);
>
>         if (adev->kfd.dev)
>                 cancel_work(&adev->kfd.reset_work);
> @@ -6232,6 +6238,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
>                       amdgpu_device_ip_need_full_reset(tmp_adev))
>                         amdgpu_ras_suspend(tmp_adev);
>
> +               amdgpu_userq_pre_reset(tmp_adev);
> +
>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index decedf8057ac..996c7397a57f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -25,8 +25,10 @@
>  #include <drm/drm_auth.h>
>  #include <drm/drm_exec.h>
>  #include <linux/pm_runtime.h>
> +#include <drm/drm_drv.h>
>
>  #include "amdgpu.h"
> +#include "amdgpu_reset.h"
>  #include "amdgpu_vm.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_userq_fence.h"
> @@ -44,6 +46,102 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
>         return userq_ip_mask;
>  }
>
> +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> +{
> +       if (amdgpu_device_should_recover_gpu(adev)) {
> +               amdgpu_reset_domain_schedule(adev->reset_domain,
> +                                            &adev->userq_reset_work);
> +               /* Wait for the reset job to complete */
> +               flush_work(&adev->userq_reset_work);
> +       }
> +}
> +
> +static int
> +amdgpu_userq_detect_and_reset_sdma(struct amdgpu_userq_mgr *uq_mgr)
> +{
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       const struct amdgpu_userq_funcs *userq_sdma_funcs =
> +                       adev->userq_funcs[AMDGPU_RING_TYPE_SDMA];
> +       struct amdgpu_usermode_queue *userq;
> +       bool has_sdma = false;
> +       bool gpu_reset = false;
> +       int id, r = 0;
> +
> +       /* Detect if SDMA queues are present */
> +       idr_for_each_entry(&uq_mgr->userq_idr, userq, id) {
> +               if (userq->queue_type == AMDGPU_RING_TYPE_SDMA) {
> +                       has_sdma = true;
> +                       break;
> +               }
> +       }
> +
> +       if (has_sdma && userq_sdma_funcs && userq_sdma_funcs->detect_and_reset) {
> +               r = userq_sdma_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_SDMA);
> +               if (r)
> +                       gpu_reset = true;
> +       }
> +
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
> +       return r;
> +}
> +
> +static int
> +amdgpu_userq_detect_and_reset_gfx_compute(struct amdgpu_userq_mgr *uq_mgr)
> +{
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       const struct amdgpu_userq_funcs *userq_gfx_funcs =
> +                       adev->userq_funcs[AMDGPU_RING_TYPE_GFX];
> +       const struct amdgpu_userq_funcs *userq_compute_funcs =
> +                       adev->userq_funcs[AMDGPU_RING_TYPE_COMPUTE];
> +       struct amdgpu_usermode_queue *userq;
> +       bool has_gfx = false, has_compute = false;
> +       bool gpu_reset = false;
> +       int id, r = 0;
> +
> +       /* Detect which types of queues are present (excluding SDMA) */
> +       idr_for_each_entry(&uq_mgr->userq_idr, userq, id) {
> +               switch (userq->queue_type) {
> +               case AMDGPU_RING_TYPE_GFX:
> +                       has_gfx = true;
> +                       break;
> +               case AMDGPU_RING_TYPE_COMPUTE:
> +                       has_compute = true;
> +                       break;
> +               default:
> +                       break;
> +               }
> +       }
> +
> +       if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> +               dev_err(adev->dev, "userq reset disabled by debug mask\n");
> +       } else if (amdgpu_gpu_recovery) {
> +               if (has_compute && userq_compute_funcs->detect_and_reset) {
> +                       r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
> +                       if (r) {
> +                               gpu_reset = true;
> +                               goto gpu_reset;
> +                       }
> +               }
> +
> +               if (has_gfx && userq_gfx_funcs->detect_and_reset) {
> +                       r = userq_gfx_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_GFX);
> +                       if (r) {
> +                               gpu_reset = true;
> +                               goto gpu_reset;
> +                       }
> +               }
> +       }
> +
> +gpu_reset:
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
> +       return r;
> +}

> Why split these into two functions?  I think it would be more efficient to
> have a single function (amdgpu_userq_detect_and_reset()) that works at the
> device level: check which types of user queues are present, call
> detect_and_reset() for each queue type that is present, and then have that
> function update the queue state of all queues that are hung.  Then just
> call this function in all the places we call preempt or unmap.  E.g.,

> amdgpu_userq_detect_and_reset()
> idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
>     /* evict, unmap, */
> }

> We probably also need some way to determine whether the queue reset was
> successful or not, because we ultimately need to decide when to do a full
> adapter reset.

Thanks, Alex.
You are right. I will revert the changes and call it everywhere we call
preempt and unmap, including when preempt/unmap fails.

Jesse

Hi Alex,

For SDMA, preemption on GFX12 always succeeds even if the SDMA queue is
hung, so we must detect hung queues before preempting/unmapping.

For GFX/compute queues, the API does not detect hung queues before
preempting/unmapping. However, if a GFX/compute queue is hung, preemption
fails; after the preemption/unmap failure, we call the detect-and-reset
API, and the MES resets the unmapped queue.

For this reason, I've split these into two functions.
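Roughly, the two orderings look like this (illustrative only; "queue" stands
for any user queue of the given type, and the helpers are the ones added in
this patch):

        /* SDMA: preemption reports success even for a hung queue,
         * so detect and reset before preempting/unmapping.
         */
        amdgpu_userq_detect_and_reset_sdma(uq_mgr);
        r = amdgpu_userq_unmap_helper(uq_mgr, queue);

        /* GFX/compute: a hung queue makes preempt/unmap fail, so the
         * helpers call detect and reset only after a failure.
         */
        r = userq_funcs->unmap(uq_mgr, queue);
        if (r)
                amdgpu_userq_detect_and_reset_gfx_compute(uq_mgr);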

Thanks
Jesse


> Alex

> +
>  static int
>  amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
>                           struct amdgpu_usermode_queue *queue)
> @@ -56,6 +154,7 @@ amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
>         if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
>                 r = userq_funcs->preempt(uq_mgr, queue);
>                 if (r) {
> +                       amdgpu_userq_detect_and_reset_gfx_compute(uq_mgr);
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
> @@ -72,17 +171,21 @@ amdgpu_userqueue_restore_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool gpu_reset = false;
>         int r = 0;
>
>         if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
>                 r = userq_funcs->restore(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       gpu_reset = true;
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_MAPPED;
>                 }
>         }
>
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
>         return r;
>  }
>
> @@ -98,11 +201,14 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>         if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
>                 (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
>                 r = userq_funcs->unmap(uq_mgr, queue);
> -               if (r)
> +               if (r) {
> +                       amdgpu_userq_detect_and_reset_gfx_compute(uq_mgr);
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> -               else
> +               } else {
>                         queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> +               }
>         }
> +
>         return r;
>  }
>
> @@ -113,16 +219,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool gpu_reset = false;
>         int r = 0;
>
>         if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
>                 r = userq_funcs->map(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       gpu_reset = true;
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_MAPPED;
>                 }
>         }
> +
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
>         return r;
>  }
>
> @@ -361,6 +473,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
>                 amdgpu_bo_unreserve(queue->db_obj.obj);
>         }
>         amdgpu_bo_unref(&queue->db_obj.obj);
> +       amdgpu_userq_detect_and_reset_sdma(uq_mgr);
>         r = amdgpu_userq_unmap_helper(uq_mgr, queue);
>         amdgpu_userq_cleanup(uq_mgr, queue, queue_id);
>         mutex_unlock(&uq_mgr->userq_mutex);
> @@ -734,6 +847,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>         int queue_id;
>         int ret = 0, r;
>
> +       amdgpu_userq_detect_and_reset_sdma(uq_mgr);
>         /* Try to unmap all the queues in this process ctx */
>         idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
>                 r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
> @@ -746,6 +860,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>         return ret;
>  }
>
> +void amdgpu_userq_reset_work(struct work_struct *work)
> +{
> +       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> +                                                 userq_reset_work);
> +       struct amdgpu_reset_context reset_context;
> +
> +       memset(&reset_context, 0, sizeof(reset_context));
> +
> +       reset_context.method = AMD_RESET_METHOD_NONE;
> +       reset_context.reset_req_dev = adev;
> +       reset_context.src = AMDGPU_RESET_SRC_USERQ;
> +       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +       /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> +       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +}
> +
>  static int
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>  {
> @@ -772,22 +903,19 @@ void
>  amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
>                    struct amdgpu_eviction_fence *ev_fence)
>  {
> -       int ret;
>         struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>         struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       int ret;
>
>         /* Wait for any pending userqueue fence work to finish */
>         ret = amdgpu_userq_wait_for_signal(uq_mgr);
> -       if (ret) {
> -               drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout 
> waiting for work\n");
> -               return;
> -       }
> +       if (ret)
> +               dev_err(adev->dev, "Not evicting userqueue, timeout
> + waiting for work\n");
>
>         ret = amdgpu_userq_evict_all(uq_mgr);
> -       if (ret) {
> -               drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
> -               return;
> -       }
> +       if (ret)
> +               dev_err(adev->dev, "Failed to evict userqueue\n");
>
>         /* Signal current eviction fence */
>         amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
> @@ -828,6 +956,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
>
>         mutex_lock(&adev->userq_mutex);
>         mutex_lock(&userq_mgr->userq_mutex);
> +       amdgpu_userq_detect_and_reset_sdma(userq_mgr);
>         idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
>                 amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
>                 amdgpu_userq_unmap_helper(userq_mgr, queue);
> @@ -861,10 +990,13 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
>         list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
>                 cancel_delayed_work_sync(&uqm->resume_work);
>                 mutex_lock(&uqm->userq_mutex);
> +               amdgpu_userq_detect_and_reset_sdma(uqm);
>                 idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
>                         r = amdgpu_userq_unmap_helper(uqm, queue);
> -                       if (r)
> +                       if (r) {
> +                               amdgpu_userq_detect_and_reset_gfx_compute(uqm);
>                                 ret = r;
> +                       }
>                 }
>                 mutex_unlock(&uqm->userq_mutex);
>         }
> @@ -917,13 +1049,16 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>         list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
>                 cancel_delayed_work_sync(&uqm->resume_work);
>                 mutex_lock(&uqm->userq_mutex);
> +               amdgpu_userq_detect_and_reset_sdma(uqm);
>                 idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
>                         if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
>                              (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
>                             (queue->xcp_id == idx)) {
>                                 r = amdgpu_userq_unmap_helper(uqm, queue);
> -                               if (r)
> +                               if (r) {
> +                                       amdgpu_userq_detect_and_reset_gfx_compute(uqm);
>                                         ret = r;
> +                               }
>                         }
>                 }
>                 mutex_unlock(&uqm->userq_mutex);
> @@ -965,3 +1100,57 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>         mutex_unlock(&adev->userq_mutex);
>         return ret;
>  }
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
> +{
> +       const struct amdgpu_userq_funcs *userq_funcs;
> +       struct amdgpu_usermode_queue *queue;
> +       struct amdgpu_userq_mgr *uqm, *tmp;
> +       int queue_id;
> +
> +       list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +               cancel_delayed_work_sync(&uqm->resume_work);
> +               idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> +                       if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> +                               amdgpu_userq_wait_for_last_fence(uqm, queue);
> +                               userq_funcs = adev->userq_funcs[queue->queue_type];
> +                               userq_funcs->unmap(uqm, queue);
> +                               /* just mark all queues as hung at this point.
> +                                * if unmap succeeds, we could map again
> +                                * in amdgpu_userq_post_reset() if vram is not lost
> +                                */
> +                               queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                               amdgpu_userq_fence_driver_force_completion(queue);
> +                       }
> +               }
> +       }
> +}
> +
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> +{
> +       /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
> +        * at this point, we should be able to map it again
> +        * and continue if vram is not lost.
> +        */
> +       struct amdgpu_userq_mgr *uqm;
> +       struct amdgpu_usermode_queue *queue;
> +       const struct amdgpu_userq_funcs *userq_funcs;
> +       int queue_id, r = 0;
> +
> +       list_for_each_entry(uqm, &adev->userq_mgr_list, list) {
> +               idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> +                       if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
> +                               userq_funcs = adev->userq_funcs[queue->queue_type];
> +                               /* re-map queue */
> +                               r = userq_funcs->map(uqm, queue);
> +                               if (r) {
> +                                       dev_err(adev->dev, "Failed to remap 
> queue %d\n", queue_id);
> +                                       continue;
> +                               }
> +                               queue->state = AMDGPU_USERQ_STATE_MAPPED;
> +                       }
> +               }
> +       }
> +
> +       return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 9fa0d1a88d71..e68bb144b26f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -138,4 +138,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>  int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                                    u32 idx);
>
> +void amdgpu_userq_reset_work(struct work_struct *work);
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +
>  #endif
> --
> 2.49.0
>
