On Mon, May 25, 2026 at 4:34 AM Zhang, Jesse(Jie) <[email protected]> wrote:
>
> AMD General
>
> > -----Original Message-----
> > From: amd-gfx <[email protected]> On Behalf Of Alex
> > Deucher
> > Sent: Friday, May 22, 2026 8:21 AM
> > To: [email protected]; Koenig, Christian
> > <[email protected]>; Khatri, Sunil <[email protected]>; Lin, Amber
> > <[email protected]>; Zhang, Jesse(Jie) <[email protected]>; Liu,
> > Shaoyun <[email protected]>
> > Cc: Deucher, Alexander <[email protected]>; Zhang, Jesse(Jie)
> > <[email protected]>
> > Subject: [PATCH 37/42] drm/amdgpu/gfx: add a common helper to handle MES
> > compute resets
> >
> > Add helpers to handle MES compute queue resets when multiple queues are
> > affected. Can be used by both KGD and KFD.
> >
> > v2: squash in updates
> > v3: squash in userq updates
> >
> > Co-developed-by: Jesse Zhang <[email protected]>
> > Co-developed-by: Amber Lin <[email protected]>
> > Signed-off-by: Amber Lin <[email protected]>
> > Signed-off-by: Jesse Zhang <[email protected]>
> > Signed-off-by: Alex Deucher <[email protected]>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 140 +++++++++++++++++++++++-
> > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 9 ++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +
> > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +
> > drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +
> > drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c | 2 +
> > 6 files changed, 160 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > index de8c85dfc4c62..960d192076de8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > @@ -34,6 +34,7 @@
> > #include "amdgpu_xcp.h"
> > #include "amdgpu_xgmi.h"
> > #include "amdgpu_mes.h"
> > +#include "mes_userqueue.h"
> > #include "nvd.h"
> >
> > /* delay 0.1 second to enable gfx off feature */ @@ -1976,15 +1977,25 @@
> > int
> > amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
> > bool use_mmio)
> > {
> > struct amdgpu_device *adev = ring->adev;
> > + bool reinit_queue;
> > int r;
> >
> [Zhang, Jesse(Jie)] The `*_pipe_reset_enabled` flags are always 0. Should
> they be set?
They should only be set when the pipe reset is done in MES firmware. For now,
the driver is doing it via MMIO.
Alex
>
> > + if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) &&
> > + adev->mes.compute_pipe_reset_enabled)
> > + reinit_queue = true;
> > + else if ((ring->funcs->type == AMDGPU_RING_TYPE_GFX) &&
> > + adev->mes.gfx_pipe_reset_enabled)
> > + reinit_queue = true;
> > + else
> > + reinit_queue = use_mmio;
> > +
> > amdgpu_ring_reset_helper_begin(ring, timedout_fence);
> >
> > r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio,
> > 0);
> > if (r)
> > return r;
> >
> > - if (use_mmio) {
> > + if (reinit_queue) {
> > r = amdgpu_mes_unmap_legacy_queue(adev, ring,
> > RESET_QUEUES, 0, 0, 0);
> > if (r)
> > @@ -2159,6 +2170,133 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device
> > *adev)
> > }
> > }
> >
> > +static void amdgpu_gfx_reset_start_compute_scheds(struct amdgpu_device
> > *adev,
> > + struct amdgpu_ring
> > *guilty_ring) {
> > + struct amdgpu_ring *ring;
> > + int i;
> > +
> > + for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> > + ring = &adev->gfx.compute_ring[i];
> > + if (ring == guilty_ring)
> > + continue;
> > + drm_sched_wqueue_start(&ring->sched);
> > + }
> > +}
> > +
> > +static void amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device
> > *adev,
> > + struct amdgpu_ring
> > *guilty_ring) {
> > + struct amdgpu_ring *ring;
> > + int i;
> > +
> > + for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> > + ring = &adev->gfx.compute_ring[i];
> > + if (ring == guilty_ring)
> > + continue;
> > + drm_sched_wqueue_stop(&ring->sched);
> > + }
> > +}
> > +
> > +static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev,
> > + struct amdgpu_ring *guilty_ring,
> > + unsigned int db)
> > +{
> > + bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
> > + struct amdgpu_fence *fence;
> > + struct amdgpu_ring *ring;
> > + int i, r;
> > +
> > + for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> > + ring = &adev->gfx.compute_ring[i];
> > + if (ring == guilty_ring)
> > + continue;
> > + if (ring->doorbell_index == db) {
> > + fence = amdgpu_ring_find_guilty_fence(ring);
> > + r = amdgpu_gfx_mes_reset_queue(ring, 0, fence,
> > use_mmio);
> > + if (r)
> > + return r;
> > + break;
> > + }
> > + }
> > + return 0;
> > +}
> > +
> > +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
> > + struct amdgpu_ring *ring,
> > + struct amdgpu_fence *guilty_fence,
> > + struct amdgpu_usermode_queue *uq,
> > + unsigned int *hung_queue_count)
> > +{
> > + struct amdgpu_mes_hung_queue_hqd_info *hqd_info =
> > + (struct amdgpu_mes_hung_queue_hqd_info *)
> > + &adev->gfx.mec.mes_hung_db_array[adev-
> > >mes.hung_queue_hqd_info_offset];
> > + int i, r, pipe, queue, queue_type;
> > + unsigned int num_hung = 0;
> > + bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
> > +
> > + guard(mutex)(&adev->gfx.mec.reset_mutex);
> > + /* stop the drm schedulers for all compute queues */
> > + amdgpu_gfx_reset_stop_compute_scheds(adev, ring);
> > + /* suspend all will determine which queues are hung.
> > + * reset detect will return the array of bad queue doorbells
> > + */
> > + r = amdgpu_mes_suspend(adev, 0);
> > + /* if suspend all succeeds, there should be no hung queues */
> > + if (!r)
> > + /* always reset the KCQ/userq since we need to signal the
> > fence
> > + * and we could be stuck in a loop which is preemptable.
> > + */
> > + goto fence_reset;
> > + r = amdgpu_mes_detect_and_reset_hung_queues(adev,
> > AMDGPU_RING_TYPE_COMPUTE,
> > + true, &num_hung, adev-
> > >gfx.mec.mes_hung_db_array, 0);
> > + if (r)
> > + goto out;
> > + if (hung_queue_count)
> > + *hung_queue_count = num_hung;
> > +
> > +fence_reset:
> > + /* reset the queue this came from if specified */
> > + if (ring) {
> > + r = amdgpu_gfx_mes_reset_queue(ring, 0, guilty_fence,
> > use_mmio);
> > + if (r)
> > + goto out;
> > + }
> > + if (uq) {
> > + r = mes_userq_reset(uq);
> > + if (r)
> > + goto out;
> > + }
> > + for (i = 0; i < num_hung; i++) {
> > + pipe = hqd_info[i].pipe_index;
> > + queue = hqd_info[i].queue_index;
> > + queue_type = hqd_info[i].queue_type;
> > +
> > + /* reset any KCQs */
> > + r = amdgpu_gfx_reset_mes_kcq(adev, ring,
> > +
> > adev->gfx.mec.mes_hung_db_array[i]);
> > + if (r)
> > + goto out;
> > + /* reset any KFD queues */
> > + r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe,
> > queue,
> > + adev-
> > >gfx.mec.mes_hung_db_array[i]);
> > + if (r)
> > + goto out;
> > + /* reset KGD user queues */
> > + r = mes_userq_reset_queue(adev, uq, queue_type, pipe, queue,
> > + adev->gfx.mec.mes_hung_db_array[i]);
> > + if (r)
> > + goto out;
> > + }
> > +out:
> > + /* resume all will enable the non-hung queues */
> > + amdgpu_mes_resume(adev, 0);
> > + if (!r)
> > + amdgpu_gfx_reset_start_compute_scheds(adev, ring);
> > +
> > + return r;
> > +}
> > +
> > int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
> > unsigned int cleaner_shader_size) {
> > diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index f9175faa64ab7..8ef2ef394e9af 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -36,6 +36,8 @@
> > #include "amdgpu_ring_mux.h"
> > #include "amdgpu_xcp.h"
> >
> > +struct amdgpu_usermode_queue;
> > +
> > /* GFX current status */
> > #define AMDGPU_GFX_NORMAL_MODE 0x00000000L
> > #define AMDGPU_GFX_SAFE_MODE 0x00000001L
> > @@ -117,6 +119,8 @@ struct amdgpu_mec {
> > u32 num_queue_per_pipe;
> > void *mqd_backup[AMDGPU_MAX_COMPUTE_RINGS *
> > AMDGPU_MAX_GC_INSTANCES];
> > bool use_mmio_for_reset;
> > + u32 *mes_hung_db_array;
> > + struct mutex reset_mutex;
> > };
> >
> > struct amdgpu_mec_bitmap {
> > @@ -642,6 +646,11 @@ int amdgpu_gfx_poison_consumption_handler(struct
> > amdgpu_device *adev, bool amdgpu_gfx_is_master_xcc(struct amdgpu_device
> > *adev, int xcc_id); int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev);
> > void
> > amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev);
> > +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
> > + struct amdgpu_ring *ring,
> > + struct amdgpu_fence *guilty_fence,
> > + struct amdgpu_usermode_queue *uq,
> > + unsigned int *hung_queue_count);
> > void amdgpu_gfx_ras_error_func(struct amdgpu_device *adev,
> > void *ras_error_status,
> > void (*func)(struct amdgpu_device *adev, void
> > *ras_error_status, diff
> > --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > index 370e8d159b6fe..ec4d9a1e029a7 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > @@ -252,6 +252,10 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
> > }
> > }
> >
> > + adev->gfx.mec.mes_hung_db_array =
> > + kcalloc(amdgpu_mes_get_hung_queue_db_array_size(adev),
> > + sizeof(u32), GFP_KERNEL);
> > +
> > return 0;
> >
> > error_doorbell:
> > @@ -279,6 +283,8 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
> > int i;
> > int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) :
> > 1;
> >
> > + kfree(adev->gfx.mec.mes_hung_db_array);
> > +
> > amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
> > &adev->mes.event_log_gpu_addr,
> > &adev->mes.event_log_cpu_addr); diff --git
> > a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > index 1a214c274ad02..32e01eb311c3b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > @@ -1908,6 +1908,8 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block
> > *ip_block)
> > adev->gfx.me.use_mmio_for_reset = false;
> > adev->gfx.mec.use_mmio_for_reset = true;
> >
> > + mutex_init(&adev->gfx.mec.reset_mutex);
> > +
> > return 0;
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > index 5beb0ae980d0b..247bcb7034e19 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > @@ -1606,6 +1606,8 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> > *ip_block)
> > adev->gfx.me.use_mmio_for_reset = false;
> > adev->gfx.mec.use_mmio_for_reset = true;
> >
> > + mutex_init(&adev->gfx.mec.reset_mutex);
> > +
> > return 0;
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> > index 033f15e21ad33..7f8e43130bd28 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> > @@ -1287,6 +1287,8 @@ static int gfx_v12_1_sw_init(struct amdgpu_ip_block
> > *ip_block)
> > if (r)
> > return r;
> >
> > + mutex_init(&adev->gfx.mec.reset_mutex);
> > +
> > return 0;
> > }
> >
> > --
> > 2.54.0
>