On Mon, May 25, 2026 at 4:34 AM Zhang, Jesse(Jie) <[email protected]> wrote:
>
> AMD General
>
> > -----Original Message-----
> > From: amd-gfx <[email protected]> On Behalf Of Alex
> > Deucher
> > Sent: Friday, May 22, 2026 8:21 AM
> > To: [email protected]; Koenig, Christian
> > <[email protected]>; Khatri, Sunil <[email protected]>; Lin, Amber
> > <[email protected]>; Zhang, Jesse(Jie) <[email protected]>; Liu,
> > Shaoyun <[email protected]>
> > Cc: Deucher, Alexander <[email protected]>; Zhang, Jesse(Jie)
> > <[email protected]>
> > Subject: [PATCH 37/42] drm/amdgpu/gfx: add a common helper to handle MES
> > compute resets
> >
> > Add helpers to handle MES compute queue resets when multiple queues are
> > affected.  Can be used by both KGD and KFD.
> >
> > v2: squash in updates
> > v3: squash in userq updates
> >
> > Co-developed-by: Jesse Zhang <[email protected]>
> > Co-developed-by: Amber Lin <[email protected]>
> > Signed-off-by: Amber Lin <[email protected]>
> > Signed-off-by: Jesse Zhang <[email protected]>
> > Signed-off-by: Alex Deucher <[email protected]>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 140 +++++++++++++++++++++++-
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   9 ++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |   6 +
> >  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  |   2 +
> >  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c  |   2 +
> >  drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c  |   2 +
> >  6 files changed, 160 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > index de8c85dfc4c62..960d192076de8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > @@ -34,6 +34,7 @@
> >  #include "amdgpu_xcp.h"
> >  #include "amdgpu_xgmi.h"
> >  #include "amdgpu_mes.h"
> > +#include "mes_userqueue.h"
> >  #include "nvd.h"
> >
> >  /* delay 0.1 second to enable gfx off feature */ @@ -1976,15 +1977,25 @@ 
> > int
> > amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
> >                              bool use_mmio)
> >  {
> >       struct amdgpu_device *adev = ring->adev;
> > +     bool reinit_queue;
> >       int r;
> >
> [Zhang, Jesse(Jie)]  The `*_pipe_reset_enabled` properties are always 0.
> Should we set the flags?

It should only be set when we do pipe reset in MES firmware.  For now,
the driver is doing it via MMIO.

Alex

>
> > +     if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) &&
> > +         adev->mes.compute_pipe_reset_enabled)
> > +             reinit_queue = true;
> > +     else if ((ring->funcs->type == AMDGPU_RING_TYPE_GFX) &&
> > +              adev->mes.gfx_pipe_reset_enabled)
> > +             reinit_queue = true;
> > +     else
> > +             reinit_queue = use_mmio;
> > +
> >       amdgpu_ring_reset_helper_begin(ring, timedout_fence);
> >
> >       r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 
> > 0);
> >       if (r)
> >               return r;
> >
> > -     if (use_mmio) {
> > +     if (reinit_queue) {
> >               r = amdgpu_mes_unmap_legacy_queue(adev, ring,
> >                                                 RESET_QUEUES, 0, 0, 0);
> >               if (r)
> > @@ -2159,6 +2170,133 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device
> > *adev)
> >       }
> >  }
> >
> > +static void amdgpu_gfx_reset_start_compute_scheds(struct amdgpu_device
> > *adev,
> > +                                               struct amdgpu_ring 
> > *guilty_ring) {
> > +     struct amdgpu_ring *ring;
> > +     int i;
> > +
> > +     for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> > +             ring = &adev->gfx.compute_ring[i];
> > +             if (ring == guilty_ring)
> > +                     continue;
> > +             drm_sched_wqueue_start(&ring->sched);
> > +     }
> > +}
> > +
> > +static void amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device
> > *adev,
> > +                                              struct amdgpu_ring 
> > *guilty_ring) {
> > +     struct amdgpu_ring *ring;
> > +     int i;
> > +
> > +     for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> > +             ring = &adev->gfx.compute_ring[i];
> > +             if (ring == guilty_ring)
> > +                     continue;
> > +             drm_sched_wqueue_stop(&ring->sched);
> > +     }
> > +}
> > +
> > +static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev,
> > +                                 struct amdgpu_ring *guilty_ring,
> > +                                 unsigned int db)
> > +{
> > +     bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
> > +     struct amdgpu_fence *fence;
> > +     struct amdgpu_ring *ring;
> > +     int i, r;
> > +
> > +     for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> > +             ring = &adev->gfx.compute_ring[i];
> > +             if (ring == guilty_ring)
> > +                     continue;
> > +             if (ring->doorbell_index == db) {
> > +                     fence = amdgpu_ring_find_guilty_fence(ring);
> > +                     r = amdgpu_gfx_mes_reset_queue(ring, 0, fence,
> > use_mmio);
> > +                     if (r)
> > +                             return r;
> > +                     break;
> > +             }
> > +     }
> > +     return 0;
> > +}
> > +
> > +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
> > +                              struct amdgpu_ring *ring,
> > +                              struct amdgpu_fence *guilty_fence,
> > +                              struct amdgpu_usermode_queue *uq,
> > +                              unsigned int *hung_queue_count)
> > +{
> > +     struct amdgpu_mes_hung_queue_hqd_info *hqd_info =
> > +             (struct amdgpu_mes_hung_queue_hqd_info *)
> > +             &adev->gfx.mec.mes_hung_db_array[adev-
> > >mes.hung_queue_hqd_info_offset];
> > +     int i, r, pipe, queue, queue_type;
> > +     unsigned int num_hung = 0;
> > +     bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
> > +
> > +     guard(mutex)(&adev->gfx.mec.reset_mutex);
> > +     /* stop the drm schedulers for all compute queues */
> > +     amdgpu_gfx_reset_stop_compute_scheds(adev, ring);
> > +     /* suspend all will determine which queues are hung.
> > +      * reset detect will return the array of bad queue doorbells
> > +      */
> > +     r = amdgpu_mes_suspend(adev, 0);
> > +     /* if suspend all succeeds, there should be no hung queues */
> > +     if (!r)
> > +             /* always reset the KCQ/userq since we need to signal the 
> > fence
> > +              * and we could be stuck in a loop which is preemptable.
> > +              */
> > +             goto fence_reset;
> > +     r = amdgpu_mes_detect_and_reset_hung_queues(adev,
> > AMDGPU_RING_TYPE_COMPUTE,
> > +                                                 true, &num_hung, adev-
> > >gfx.mec.mes_hung_db_array, 0);
> > +     if (r)
> > +             goto out;
> > +     if (hung_queue_count)
> > +             *hung_queue_count = num_hung;
> > +
> > +fence_reset:
> > +     /* reset the queue this came from if specified */
> > +     if (ring) {
> > +             r = amdgpu_gfx_mes_reset_queue(ring, 0, guilty_fence, 
> > use_mmio);
> > +             if (r)
> > +                     goto out;
> > +     }
> > +     if (uq) {
> > +             r = mes_userq_reset(uq);
> > +             if (r)
> > +                     goto out;
> > +     }
> > +     for (i = 0; i < num_hung; i++) {
> > +             pipe = hqd_info[i].pipe_index;
> > +             queue = hqd_info[i].queue_index;
> > +             queue_type = hqd_info[i].queue_type;
> > +
> > +             /* reset any KCQs */
> > +             r = amdgpu_gfx_reset_mes_kcq(adev, ring,
> > +                                          
> > adev->gfx.mec.mes_hung_db_array[i]);
> > +             if (r)
> > +                     goto out;
> > +             /* reset any KFD queues */
> > +             r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe,
> > queue,
> > +                                               adev-
> > >gfx.mec.mes_hung_db_array[i]);
> > +             if (r)
> > +                     goto out;
> > +             /* reset KGD user queues */
> > +             r = mes_userq_reset_queue(adev, uq, queue_type, pipe, queue,
> > +                                       adev->gfx.mec.mes_hung_db_array[i]);
> > +             if (r)
> > +                     goto out;
> > +     }
> > +out:
> > +     /* resume all will enable the non-hung queues */
> > +     amdgpu_mes_resume(adev, 0);
> > +     if (!r)
> > +             amdgpu_gfx_reset_start_compute_scheds(adev, ring);
> > +
> > +     return r;
> > +}
> > +
> >  int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
> >                                     unsigned int cleaner_shader_size)  { 
> > diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index f9175faa64ab7..8ef2ef394e9af 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -36,6 +36,8 @@
> >  #include "amdgpu_ring_mux.h"
> >  #include "amdgpu_xcp.h"
> >
> > +struct amdgpu_usermode_queue;
> > +
> >  /* GFX current status */
> >  #define AMDGPU_GFX_NORMAL_MODE                       0x00000000L
> >  #define AMDGPU_GFX_SAFE_MODE                 0x00000001L
> > @@ -117,6 +119,8 @@ struct amdgpu_mec {
> >       u32 num_queue_per_pipe;
> >       void                    *mqd_backup[AMDGPU_MAX_COMPUTE_RINGS *
> > AMDGPU_MAX_GC_INSTANCES];
> >       bool use_mmio_for_reset;
> > +     u32 *mes_hung_db_array;
> > +     struct mutex            reset_mutex;
> >  };
> >
> >  struct amdgpu_mec_bitmap {
> > @@ -642,6 +646,11 @@ int amdgpu_gfx_poison_consumption_handler(struct
> > amdgpu_device *adev,  bool amdgpu_gfx_is_master_xcc(struct amdgpu_device
> > *adev, int xcc_id);  int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev); 
> >  void
> > amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev);
> > +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
> > +                              struct amdgpu_ring *ring,
> > +                              struct amdgpu_fence *guilty_fence,
> > +                              struct amdgpu_usermode_queue *uq,
> > +                              unsigned int *hung_queue_count);
> >  void amdgpu_gfx_ras_error_func(struct amdgpu_device *adev,
> >               void *ras_error_status,
> >               void (*func)(struct amdgpu_device *adev, void 
> > *ras_error_status, diff
> > --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > index 370e8d159b6fe..ec4d9a1e029a7 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > @@ -252,6 +252,10 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
> >               }
> >       }
> >
> > +     adev->gfx.mec.mes_hung_db_array =
> > +             kcalloc(amdgpu_mes_get_hung_queue_db_array_size(adev),
> > +                     sizeof(u32), GFP_KERNEL);
> > +
> >       return 0;
> >
> >  error_doorbell:
> > @@ -279,6 +283,8 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
> >       int i;
> >       int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) :
> > 1;
> >
> > +     kfree(adev->gfx.mec.mes_hung_db_array);
> > +
> >       amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
> >                             &adev->mes.event_log_gpu_addr,
> >                             &adev->mes.event_log_cpu_addr); diff --git
> > a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > index 1a214c274ad02..32e01eb311c3b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > @@ -1908,6 +1908,8 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block
> > *ip_block)
> >       adev->gfx.me.use_mmio_for_reset = false;
> >       adev->gfx.mec.use_mmio_for_reset = true;
> >
> > +     mutex_init(&adev->gfx.mec.reset_mutex);
> > +
> >       return 0;
> >  }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > index 5beb0ae980d0b..247bcb7034e19 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > @@ -1606,6 +1606,8 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> > *ip_block)
> >       adev->gfx.me.use_mmio_for_reset = false;
> >       adev->gfx.mec.use_mmio_for_reset = true;
> >
> > +     mutex_init(&adev->gfx.mec.reset_mutex);
> > +
> >       return 0;
> >  }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> > index 033f15e21ad33..7f8e43130bd28 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> > @@ -1287,6 +1287,8 @@ static int gfx_v12_1_sw_init(struct amdgpu_ip_block
> > *ip_block)
> >       if (r)
> >               return r;
> >
> > +     mutex_init(&adev->gfx.mec.reset_mutex);
> > +
> >       return 0;
> >  }
> >
> > --
> > 2.54.0
>

Reply via email to