From: Alex Deucher <[email protected]> Add helpers to handle MES compute queue resets when multiple queues are affected. These can be used by both KGD and KFD.
v2: squash in updates v3: squash in userq updates Co-developed-by: Jesse Zhang <[email protected]> Co-developed-by: Amber Lin <[email protected]> Signed-off-by: Amber Lin <[email protected]> Signed-off-by: Jesse Zhang <[email protected]> Signed-off-by: Alex Deucher <[email protected]> Reviewed-by: Jesse Zhang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 140 +++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 9 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 + drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 + drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 + drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c | 2 + 6 files changed, 160 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index de8c85dfc4c6..960d192076de 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -34,6 +34,7 @@ #include "amdgpu_xcp.h" #include "amdgpu_xgmi.h" #include "amdgpu_mes.h" +#include "mes_userqueue.h" #include "nvd.h" /* delay 0.1 second to enable gfx off feature */ @@ -1976,15 +1977,25 @@ int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring, bool use_mmio) { struct amdgpu_device *adev = ring->adev; + bool reinit_queue; int r; + if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) && + adev->mes.compute_pipe_reset_enabled) + reinit_queue = true; + else if ((ring->funcs->type == AMDGPU_RING_TYPE_GFX) && + adev->mes.gfx_pipe_reset_enabled) + reinit_queue = true; + else + reinit_queue = use_mmio; + amdgpu_ring_reset_helper_begin(ring, timedout_fence); r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 0); if (r) return r; - if (use_mmio) { + if (reinit_queue) { r = amdgpu_mes_unmap_legacy_queue(adev, ring, RESET_QUEUES, 0, 0, 0); if (r) @@ -2159,6 +2170,133 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev) } } +static void amdgpu_gfx_reset_start_compute_scheds(struct amdgpu_device *adev, + struct amdgpu_ring *guilty_ring) +{ + 
struct amdgpu_ring *ring; + int i; + + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + ring = &adev->gfx.compute_ring[i]; + if (ring == guilty_ring) + continue; + drm_sched_wqueue_start(&ring->sched); + } +} + +static void amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device *adev, + struct amdgpu_ring *guilty_ring) +{ + struct amdgpu_ring *ring; + int i; + + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + ring = &adev->gfx.compute_ring[i]; + if (ring == guilty_ring) + continue; + drm_sched_wqueue_stop(&ring->sched); + } +} + +static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev, + struct amdgpu_ring *guilty_ring, + unsigned int db) +{ + bool use_mmio = adev->gfx.mec.use_mmio_for_reset; + struct amdgpu_fence *fence; + struct amdgpu_ring *ring; + int i, r; + + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + ring = &adev->gfx.compute_ring[i]; + if (ring == guilty_ring) + continue; + if (ring->doorbell_index == db) { + fence = amdgpu_ring_find_guilty_fence(ring); + r = amdgpu_gfx_mes_reset_queue(ring, 0, fence, use_mmio); + if (r) + return r; + break; + } + } + return 0; +} + +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev, + struct amdgpu_ring *ring, + struct amdgpu_fence *guilty_fence, + struct amdgpu_usermode_queue *uq, + unsigned int *hung_queue_count) +{ + struct amdgpu_mes_hung_queue_hqd_info *hqd_info = + (struct amdgpu_mes_hung_queue_hqd_info *) + &adev->gfx.mec.mes_hung_db_array[adev->mes.hung_queue_hqd_info_offset]; + int i, r, pipe, queue, queue_type; + unsigned int num_hung = 0; + bool use_mmio = adev->gfx.mec.use_mmio_for_reset; + + guard(mutex)(&adev->gfx.mec.reset_mutex); + /* stop the drm schedulers for all compute queues */ + amdgpu_gfx_reset_stop_compute_scheds(adev, ring); + /* suspend all will determine which queues are hung. 
+ * reset detect will return the array of bad queue doorbells + */ + r = amdgpu_mes_suspend(adev, 0); + /* if suspend all success, it should no hang queue */ + if (!r) + /* always reset the KCQ/userq since we need to signal the fence + * and we could be stuck in a loop which is preemptable. + */ + goto fence_reset; + r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE, + true, &num_hung, adev->gfx.mec.mes_hung_db_array, 0); + if (r) + goto out; + if (hung_queue_count) + *hung_queue_count = num_hung; + +fence_reset: + /* reset the queue this came from if specified */ + if (ring) { + r = amdgpu_gfx_mes_reset_queue(ring, 0, guilty_fence, use_mmio); + if (r) + goto out; + } + if (uq) { + r = mes_userq_reset(uq); + if (r) + goto out; + } + for (i = 0; i < num_hung; i++) { + pipe = hqd_info[i].pipe_index; + queue = hqd_info[i].queue_index; + queue_type = hqd_info[i].queue_type; + + /* reset any KCQs */ + r = amdgpu_gfx_reset_mes_kcq(adev, ring, + adev->gfx.mec.mes_hung_db_array[i]); + if (r) + goto out; + /* reset any KFD queues */ + r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe, queue, + adev->gfx.mec.mes_hung_db_array[i]); + if (r) + goto out; + /* reset KGD user queues */ + r = mes_userq_reset_queue(adev, uq, queue_type, pipe, queue, + adev->gfx.mec.mes_hung_db_array[i]); + if (r) + goto out; + } +out: + /* resume all will enable the non-hung queues */ + amdgpu_mes_resume(adev, 0); + if (!r) + amdgpu_gfx_reset_start_compute_scheds(adev, ring); + + return r; +} + int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev, unsigned int cleaner_shader_size) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index f9175faa64ab..8ef2ef394e9a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -36,6 +36,8 @@ #include "amdgpu_ring_mux.h" #include "amdgpu_xcp.h" +struct amdgpu_usermode_queue; + /* GFX current status */ #define 
AMDGPU_GFX_NORMAL_MODE 0x00000000L #define AMDGPU_GFX_SAFE_MODE 0x00000001L @@ -117,6 +119,8 @@ struct amdgpu_mec { u32 num_queue_per_pipe; void *mqd_backup[AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES]; bool use_mmio_for_reset; + u32 *mes_hung_db_array; + struct mutex reset_mutex; }; struct amdgpu_mec_bitmap { @@ -642,6 +646,11 @@ int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev, bool amdgpu_gfx_is_master_xcc(struct amdgpu_device *adev, int xcc_id); int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev); void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev); +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev, + struct amdgpu_ring *ring, + struct amdgpu_fence *guilty_fence, + struct amdgpu_usermode_queue *uq, + unsigned int *hung_queue_count); void amdgpu_gfx_ras_error_func(struct amdgpu_device *adev, void *ras_error_status, void (*func)(struct amdgpu_device *adev, void *ras_error_status, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 370e8d159b6f..ec4d9a1e029a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -252,6 +252,10 @@ int amdgpu_mes_init(struct amdgpu_device *adev) } } + adev->gfx.mec.mes_hung_db_array = + kcalloc(amdgpu_mes_get_hung_queue_db_array_size(adev), + sizeof(u32), GFP_KERNEL); + return 0; error_doorbell: @@ -279,6 +283,8 @@ void amdgpu_mes_fini(struct amdgpu_device *adev) int i; int num_xcc = adev->gfx.xcc_mask ? 
NUM_XCC(adev->gfx.xcc_mask) : 1; + kfree(adev->gfx.mec.mes_hung_db_array); + amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, &adev->mes.event_log_cpu_addr); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 1a214c274ad0..32e01eb311c3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1908,6 +1908,8 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) adev->gfx.me.use_mmio_for_reset = false; adev->gfx.mec.use_mmio_for_reset = true; + mutex_init(&adev->gfx.mec.reset_mutex); + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 5beb0ae980d0..247bcb7034e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -1606,6 +1606,8 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) adev->gfx.me.use_mmio_for_reset = false; adev->gfx.mec.use_mmio_for_reset = true; + mutex_init(&adev->gfx.mec.reset_mutex); + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c index 033f15e21ad3..7f8e43130bd2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c @@ -1287,6 +1287,8 @@ static int gfx_v12_1_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + mutex_init(&adev->gfx.mec.reset_mutex); + return 0; } -- 2.49.0
