gfx: add a common helper to handle MES compute resets

Zhang, Jesse(Jie) Mon, 25 May 2026 01:18:48 -0700

AMD General

> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of Alex
> Deucher
> Sent: Friday, May 22, 2026 8:21 AM
> To: [email protected]; Koenig, Christian
> <[email protected]>; Khatri, Sunil <[email protected]>; Lin, Amber
> <[email protected]>; Zhang, Jesse(Jie) <[email protected]>; Liu,
> Shaoyun <[email protected]>
> Cc: Deucher, Alexander <[email protected]>; Zhang, Jesse(Jie)
> <[email protected]>
> Subject: [PATCH 37/42] drm/amdgpu/gfx: add a common helper to handle MES
> compute resets
>
> Add helpers to handle MES compute queue resets when multiple queues are
> affected.  Can be used by both KGD and KFD.
>
> v2: squash in updates
> v3: squash in userq updates
>
> Co-developed-by: Jesse Zhang <[email protected]>
> Co-developed-by: Amber Lin <[email protected]>
> Signed-off-by: Amber Lin <[email protected]>
> Signed-off-by: Jesse Zhang <[email protected]>
> Signed-off-by: Alex Deucher <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 140 +++++++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   9 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |   6 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  |   2 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c  |   2 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c  |   2 +
>  6 files changed, 160 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index de8c85dfc4c62..960d192076de8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -34,6 +34,7 @@
>  #include "amdgpu_xcp.h"
>  #include "amdgpu_xgmi.h"
>  #include "amdgpu_mes.h"
> +#include "mes_userqueue.h"
>  #include "nvd.h"
>
>  /* delay 0.1 second to enable gfx off feature */ @@ -1976,15 +1977,25 @@ int
> amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
>                              bool use_mmio)
>  {
>       struct amdgpu_device *adev = ring->adev;
> +     bool reinit_queue;
>       int r;
>
[Zhang, Jesse(Jie)]  The `*_pipe_reset_enabled` flags are always 0. Should we
set them?


> +     if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) &&
> +         adev->mes.compute_pipe_reset_enabled)
> +             reinit_queue = true;
> +     else if ((ring->funcs->type == AMDGPU_RING_TYPE_GFX) &&
> +              adev->mes.gfx_pipe_reset_enabled)
> +             reinit_queue = true;
> +     else
> +             reinit_queue = use_mmio;
> +
>       amdgpu_ring_reset_helper_begin(ring, timedout_fence);
>
>       r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 0);
>       if (r)
>               return r;
>
> -     if (use_mmio) {
> +     if (reinit_queue) {
>               r = amdgpu_mes_unmap_legacy_queue(adev, ring,
>                                                 RESET_QUEUES, 0, 0, 0);
>               if (r)
> @@ -2159,6 +2170,133 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device
> *adev)
>       }
>  }
>
> +static void amdgpu_gfx_reset_start_compute_scheds(struct amdgpu_device
> *adev,
> +                                               struct amdgpu_ring 
> *guilty_ring) {
> +     struct amdgpu_ring *ring;
> +     int i;
> +
> +     for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> +             ring = &adev->gfx.compute_ring[i];
> +             if (ring == guilty_ring)
> +                     continue;
> +             drm_sched_wqueue_start(&ring->sched);
> +     }
> +}
> +
> +static void amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device
> *adev,
> +                                              struct amdgpu_ring 
> *guilty_ring) {
> +     struct amdgpu_ring *ring;
> +     int i;
> +
> +     for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> +             ring = &adev->gfx.compute_ring[i];
> +             if (ring == guilty_ring)
> +                     continue;
> +             drm_sched_wqueue_stop(&ring->sched);
> +     }
> +}
> +
> +static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev,
> +                                 struct amdgpu_ring *guilty_ring,
> +                                 unsigned int db)
> +{
> +     bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
> +     struct amdgpu_fence *fence;
> +     struct amdgpu_ring *ring;
> +     int i, r;
> +
> +     for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> +             ring = &adev->gfx.compute_ring[i];
> +             if (ring == guilty_ring)
> +                     continue;
> +             if (ring->doorbell_index == db) {
> +                     fence = amdgpu_ring_find_guilty_fence(ring);
> +                     r = amdgpu_gfx_mes_reset_queue(ring, 0, fence,
> use_mmio);
> +                     if (r)
> +                             return r;
> +                     break;
> +             }
> +     }
> +     return 0;
> +}
> +
> +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
> +                              struct amdgpu_ring *ring,
> +                              struct amdgpu_fence *guilty_fence,
> +                              struct amdgpu_usermode_queue *uq,
> +                              unsigned int *hung_queue_count)
> +{
> +     struct amdgpu_mes_hung_queue_hqd_info *hqd_info =
> +             (struct amdgpu_mes_hung_queue_hqd_info *)
> +             &adev->gfx.mec.mes_hung_db_array[adev-
> >mes.hung_queue_hqd_info_offset];
> +     int i, r, pipe, queue, queue_type;
> +     unsigned int num_hung = 0;
> +     bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
> +
> +     guard(mutex)(&adev->gfx.mec.reset_mutex);
> +     /* stop the drm schedulers for all compute queues */
> +     amdgpu_gfx_reset_stop_compute_scheds(adev, ring);
> +     /* suspend all will determine which queues are hung.
> +      * reset detect will return the array of bad queue doorbells
> +      */
> +     r = amdgpu_mes_suspend(adev, 0);
> +     /* if suspend all success, it should no hang queue */
> +     if (!r)
> +             /* always reset the KCQ/userq since we need to signal the fence
> +              * and we could be stuck in a loop which is preemptable.
> +              */
> +             goto fence_reset;
> +     r = amdgpu_mes_detect_and_reset_hung_queues(adev,
> AMDGPU_RING_TYPE_COMPUTE,
> +                                                 true, &num_hung, adev-
> >gfx.mec.mes_hung_db_array, 0);
> +     if (r)
> +             goto out;
> +     if (hung_queue_count)
> +             *hung_queue_count = num_hung;
> +
> +fence_reset:
> +     /* reset the queue this came from if specified */
> +     if (ring) {
> +             r = amdgpu_gfx_mes_reset_queue(ring, 0, guilty_fence, use_mmio);
> +             if (r)
> +                     goto out;
> +     }
> +     if (uq) {
> +             r = mes_userq_reset(uq);
> +             if (r)
> +                     goto out;
> +     }
> +     for (i = 0; i < num_hung; i++) {
> +             pipe = hqd_info[i].pipe_index;
> +             queue = hqd_info[i].queue_index;
> +             queue_type = hqd_info[i].queue_type;
> +
> +             /* reset any KCQs */
> +             r = amdgpu_gfx_reset_mes_kcq(adev, ring,
> +                                          
> adev->gfx.mec.mes_hung_db_array[i]);
> +             if (r)
> +                     goto out;
> +             /* reset any KFD queues */
> +             r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe,
> queue,
> +                                               adev-
> >gfx.mec.mes_hung_db_array[i]);
> +             if (r)
> +                     goto out;
> +             /* reset KGD user queues */
> +             r = mes_userq_reset_queue(adev, uq, queue_type, pipe, queue,
> +                                       adev->gfx.mec.mes_hung_db_array[i]);
> +             if (r)
> +                     goto out;
> +     }
> +out:
> +     /* resume all will enable the non-hung queues */
> +     amdgpu_mes_resume(adev, 0);
> +     if (!r)
> +             amdgpu_gfx_reset_start_compute_scheds(adev, ring);
> +
> +     return r;
> +}
> +
>  int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
>                                     unsigned int cleaner_shader_size)  { diff 
> --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index f9175faa64ab7..8ef2ef394e9af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -36,6 +36,8 @@
>  #include "amdgpu_ring_mux.h"
>  #include "amdgpu_xcp.h"
>
> +struct amdgpu_usermode_queue;
> +
>  /* GFX current status */
>  #define AMDGPU_GFX_NORMAL_MODE                       0x00000000L
>  #define AMDGPU_GFX_SAFE_MODE                 0x00000001L
> @@ -117,6 +119,8 @@ struct amdgpu_mec {
>       u32 num_queue_per_pipe;
>       void                    *mqd_backup[AMDGPU_MAX_COMPUTE_RINGS *
> AMDGPU_MAX_GC_INSTANCES];
>       bool use_mmio_for_reset;
> +     u32 *mes_hung_db_array;
> +     struct mutex            reset_mutex;
>  };
>
>  struct amdgpu_mec_bitmap {
> @@ -642,6 +646,11 @@ int amdgpu_gfx_poison_consumption_handler(struct
> amdgpu_device *adev,  bool amdgpu_gfx_is_master_xcc(struct amdgpu_device
> *adev, int xcc_id);  int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev);  
> void
> amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev);
> +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
> +                              struct amdgpu_ring *ring,
> +                              struct amdgpu_fence *guilty_fence,
> +                              struct amdgpu_usermode_queue *uq,
> +                              unsigned int *hung_queue_count);
>  void amdgpu_gfx_ras_error_func(struct amdgpu_device *adev,
>               void *ras_error_status,
>               void (*func)(struct amdgpu_device *adev, void 
> *ras_error_status, diff
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 370e8d159b6fe..ec4d9a1e029a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -252,6 +252,10 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>               }
>       }
>
> +     adev->gfx.mec.mes_hung_db_array =
> +             kcalloc(amdgpu_mes_get_hung_queue_db_array_size(adev),
> +                     sizeof(u32), GFP_KERNEL);
> +
>       return 0;
>
>  error_doorbell:
> @@ -279,6 +283,8 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
>       int i;
>       int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) :
> 1;
>
> +     kfree(adev->gfx.mec.mes_hung_db_array);
> +
>       amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
>                             &adev->mes.event_log_gpu_addr,
>                             &adev->mes.event_log_cpu_addr); diff --git
> a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 1a214c274ad02..32e01eb311c3b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1908,6 +1908,8 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block
> *ip_block)
>       adev->gfx.me.use_mmio_for_reset = false;
>       adev->gfx.mec.use_mmio_for_reset = true;
>
> +     mutex_init(&adev->gfx.mec.reset_mutex);
> +
>       return 0;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 5beb0ae980d0b..247bcb7034e19 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1606,6 +1606,8 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> *ip_block)
>       adev->gfx.me.use_mmio_for_reset = false;
>       adev->gfx.mec.use_mmio_for_reset = true;
>
> +     mutex_init(&adev->gfx.mec.reset_mutex);
> +
>       return 0;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> index 033f15e21ad33..7f8e43130bd28 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> @@ -1287,6 +1287,8 @@ static int gfx_v12_1_sw_init(struct amdgpu_ip_block
> *ip_block)
>       if (r)
>               return r;
>
> +     mutex_init(&adev->gfx.mec.reset_mutex);
> +
>       return 0;
>  }
>
> --
> 2.54.0

RE: [PATCH 37/42] drm/amdgpu/gfx: add a common helper to handle MES compute resets

Reply via email to