AMD General
> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of Alex
> Deucher
> Sent: Friday, May 22, 2026 8:21 AM
> To: [email protected]; Koenig, Christian
> <[email protected]>; Khatri, Sunil <[email protected]>; Lin, Amber
> <[email protected]>; Zhang, Jesse(Jie) <[email protected]>; Liu,
> Shaoyun <[email protected]>
> Cc: Deucher, Alexander <[email protected]>; Zhang, Jesse(Jie)
> <[email protected]>
> Subject: [PATCH 37/42] drm/amdgpu/gfx: add a common helper to handle MES
> compute resets
>
> Add helpers to handle MES compute queue resets when multiple queues are
> affected. Can be used by both KGD and KFD.
>
> v2: squash in updates
> v3: squash in userq updates
>
> Co-developed-by: Jesse Zhang <[email protected]>
> Co-developed-by: Amber Lin <[email protected]>
> Signed-off-by: Amber Lin <[email protected]>
> Signed-off-by: Jesse Zhang <[email protected]>
> Signed-off-by: Alex Deucher <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 140 +++++++++++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 9 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +
> drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +
> drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c | 2 +
> 6 files changed, 160 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index de8c85dfc4c62..960d192076de8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -34,6 +34,7 @@
> #include "amdgpu_xcp.h"
> #include "amdgpu_xgmi.h"
> #include "amdgpu_mes.h"
> +#include "mes_userqueue.h"
> #include "nvd.h"
>
> /* delay 0.1 second to enable gfx off feature */ @@ -1976,15 +1977,25 @@ int
> amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
> bool use_mmio)
> {
> struct amdgpu_device *adev = ring->adev;
> + bool reinit_queue;
> int r;
>
[Zhang, Jesse(Jie)] The `*_pipe_reset_enabled` flags are always 0. Should we
set them?
> + if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) &&
> + adev->mes.compute_pipe_reset_enabled)
> + reinit_queue = true;
> + else if ((ring->funcs->type == AMDGPU_RING_TYPE_GFX) &&
> + adev->mes.gfx_pipe_reset_enabled)
> + reinit_queue = true;
> + else
> + reinit_queue = use_mmio;
> +
> amdgpu_ring_reset_helper_begin(ring, timedout_fence);
>
> r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 0);
> if (r)
> return r;
>
> - if (use_mmio) {
> + if (reinit_queue) {
> r = amdgpu_mes_unmap_legacy_queue(adev, ring,
> RESET_QUEUES, 0, 0, 0);
> if (r)
> @@ -2159,6 +2170,133 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device
> *adev)
> }
> }
>
> +static void amdgpu_gfx_reset_start_compute_scheds(struct amdgpu_device
> *adev,
> + struct amdgpu_ring
> *guilty_ring) {
> + struct amdgpu_ring *ring;
> + int i;
> +
> + for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> + ring = &adev->gfx.compute_ring[i];
> + if (ring == guilty_ring)
> + continue;
> + drm_sched_wqueue_start(&ring->sched);
> + }
> +}
> +
> +static void amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device
> *adev,
> + struct amdgpu_ring
> *guilty_ring) {
> + struct amdgpu_ring *ring;
> + int i;
> +
> + for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> + ring = &adev->gfx.compute_ring[i];
> + if (ring == guilty_ring)
> + continue;
> + drm_sched_wqueue_stop(&ring->sched);
> + }
> +}
> +
> +static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev,
> + struct amdgpu_ring *guilty_ring,
> + unsigned int db)
> +{
> + bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
> + struct amdgpu_fence *fence;
> + struct amdgpu_ring *ring;
> + int i, r;
> +
> + for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> + ring = &adev->gfx.compute_ring[i];
> + if (ring == guilty_ring)
> + continue;
> + if (ring->doorbell_index == db) {
> + fence = amdgpu_ring_find_guilty_fence(ring);
> + r = amdgpu_gfx_mes_reset_queue(ring, 0, fence,
> use_mmio);
> + if (r)
> + return r;
> + break;
> + }
> + }
> + return 0;
> +}
> +
> +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
> + struct amdgpu_ring *ring,
> + struct amdgpu_fence *guilty_fence,
> + struct amdgpu_usermode_queue *uq,
> + unsigned int *hung_queue_count)
> +{
> + struct amdgpu_mes_hung_queue_hqd_info *hqd_info =
> + (struct amdgpu_mes_hung_queue_hqd_info *)
> + &adev->gfx.mec.mes_hung_db_array[adev-
> >mes.hung_queue_hqd_info_offset];
> + int i, r, pipe, queue, queue_type;
> + unsigned int num_hung = 0;
> + bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
> +
> + guard(mutex)(&adev->gfx.mec.reset_mutex);
> + /* stop the drm schedulers for all compute queues */
> + amdgpu_gfx_reset_stop_compute_scheds(adev, ring);
> + /* suspend all will determine which queues are hung.
> + * reset detect will return the array of bad queue doorbells
> + */
> + r = amdgpu_mes_suspend(adev, 0);
> + /* if suspend all succeeds, there should be no hung queues */
> + if (!r)
> + /* always reset the KCQ/userq since we need to signal the fence
> + * and we could be stuck in a loop which is preemptable.
> + */
> + goto fence_reset;
> + r = amdgpu_mes_detect_and_reset_hung_queues(adev,
> AMDGPU_RING_TYPE_COMPUTE,
> + true, &num_hung, adev-
> >gfx.mec.mes_hung_db_array, 0);
> + if (r)
> + goto out;
> + if (hung_queue_count)
> + *hung_queue_count = num_hung;
> +
> +fence_reset:
> + /* reset the queue this came from if specified */
> + if (ring) {
> + r = amdgpu_gfx_mes_reset_queue(ring, 0, guilty_fence, use_mmio);
> + if (r)
> + goto out;
> + }
> + if (uq) {
> + r = mes_userq_reset(uq);
> + if (r)
> + goto out;
> + }
> + for (i = 0; i < num_hung; i++) {
> + pipe = hqd_info[i].pipe_index;
> + queue = hqd_info[i].queue_index;
> + queue_type = hqd_info[i].queue_type;
> +
> + /* reset any KCQs */
> + r = amdgpu_gfx_reset_mes_kcq(adev, ring,
> +
> adev->gfx.mec.mes_hung_db_array[i]);
> + if (r)
> + goto out;
> + /* reset any KFD queues */
> + r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe,
> queue,
> + adev-
> >gfx.mec.mes_hung_db_array[i]);
> + if (r)
> + goto out;
> + /* reset KGD user queues */
> + r = mes_userq_reset_queue(adev, uq, queue_type, pipe, queue,
> + adev->gfx.mec.mes_hung_db_array[i]);
> + if (r)
> + goto out;
> + }
> +out:
> + /* resume all will enable the non-hung queues */
> + amdgpu_mes_resume(adev, 0);
> + if (!r)
> + amdgpu_gfx_reset_start_compute_scheds(adev, ring);
> +
> + return r;
> +}
> +
> int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
> unsigned int cleaner_shader_size) { diff
> --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index f9175faa64ab7..8ef2ef394e9af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -36,6 +36,8 @@
> #include "amdgpu_ring_mux.h"
> #include "amdgpu_xcp.h"
>
> +struct amdgpu_usermode_queue;
> +
> /* GFX current status */
> #define AMDGPU_GFX_NORMAL_MODE 0x00000000L
> #define AMDGPU_GFX_SAFE_MODE 0x00000001L
> @@ -117,6 +119,8 @@ struct amdgpu_mec {
> u32 num_queue_per_pipe;
> void *mqd_backup[AMDGPU_MAX_COMPUTE_RINGS *
> AMDGPU_MAX_GC_INSTANCES];
> bool use_mmio_for_reset;
> + u32 *mes_hung_db_array;
> + struct mutex reset_mutex;
> };
>
> struct amdgpu_mec_bitmap {
> @@ -642,6 +646,11 @@ int amdgpu_gfx_poison_consumption_handler(struct
> amdgpu_device *adev, bool amdgpu_gfx_is_master_xcc(struct amdgpu_device
> *adev, int xcc_id); int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev);
> void
> amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev);
> +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
> + struct amdgpu_ring *ring,
> + struct amdgpu_fence *guilty_fence,
> + struct amdgpu_usermode_queue *uq,
> + unsigned int *hung_queue_count);
> void amdgpu_gfx_ras_error_func(struct amdgpu_device *adev,
> void *ras_error_status,
> void (*func)(struct amdgpu_device *adev, void
> *ras_error_status, diff
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 370e8d159b6fe..ec4d9a1e029a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -252,6 +252,10 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
> }
> }
>
> + adev->gfx.mec.mes_hung_db_array =
> + kcalloc(amdgpu_mes_get_hung_queue_db_array_size(adev),
> + sizeof(u32), GFP_KERNEL);
> +
> return 0;
>
> error_doorbell:
> @@ -279,6 +283,8 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
> int i;
> int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) :
> 1;
>
> + kfree(adev->gfx.mec.mes_hung_db_array);
> +
> amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
> &adev->mes.event_log_gpu_addr,
> &adev->mes.event_log_cpu_addr); diff --git
> a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 1a214c274ad02..32e01eb311c3b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1908,6 +1908,8 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> adev->gfx.me.use_mmio_for_reset = false;
> adev->gfx.mec.use_mmio_for_reset = true;
>
> + mutex_init(&adev->gfx.mec.reset_mutex);
> +
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 5beb0ae980d0b..247bcb7034e19 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1606,6 +1606,8 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> adev->gfx.me.use_mmio_for_reset = false;
> adev->gfx.mec.use_mmio_for_reset = true;
>
> + mutex_init(&adev->gfx.mec.reset_mutex);
> +
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> index 033f15e21ad33..7f8e43130bd28 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> @@ -1287,6 +1287,8 @@ static int gfx_v12_1_sw_init(struct amdgpu_ip_block
> *ip_block)
> if (r)
> return r;
>
> + mutex_init(&adev->gfx.mec.reset_mutex);
> +
> return 0;
> }
>
> --
> 2.54.0