From: Alex Deucher <[email protected]>

When we reset MES queues, we need to coordinate across KGD and KFD. Use a single function to handle the queue resets across KFD and KGD.
v2: squash in fixes for userqs Co-developed-by: Jesse Zhang <[email protected]> Co-developed-by: Amber Lin <[email protected]> Signed-off-by: Amber Lin <[email protected]> Signed-off-by: Jesse Zhang <[email protected]> Signed-off-by: Alex Deucher <[email protected]> Reviewed-by: Jesse Zhang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 7 +- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 3 +- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 3 +- drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 2 +- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 75 ++++--------------- 5 files changed, 22 insertions(+), 68 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 0c9d370341a9..94cceafce51e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -138,7 +138,12 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work) if (amdgpu_userq_is_reset_type_supported(adev, queue->queue_type, AMDGPU_RESET_TYPE_PER_QUEUE)) { - int r = userq_funcs->reset(queue); + int r; + + if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) + r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, NULL, NULL); + else + r = userq_funcs->reset(queue); if (r) gpu_reset = true; } else { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 32e01eb311c3..d707ca106823 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -6795,9 +6795,8 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring, struct amdgpu_fence *timedout_fence) { struct amdgpu_device *adev = ring->adev; - bool use_mmio = adev->gfx.mec.use_mmio_for_reset; - return amdgpu_gfx_mes_reset_queue(ring, vmid, timedout_fence, use_mmio); + return amdgpu_gfx_reset_mes_compute(adev, ring, timedout_fence, NULL, NULL); } static void gfx_v11_ip_print(struct amdgpu_ip_block *ip_block, struct drm_printer *p) diff --git 
a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 247bcb7034e1..82b362f3651a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -5231,9 +5231,8 @@ static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring, struct amdgpu_fence *timedout_fence) { struct amdgpu_device *adev = ring->adev; - bool use_mmio = adev->gfx.mec.use_mmio_for_reset; - return amdgpu_gfx_mes_reset_queue(ring, vmid, timedout_fence, use_mmio); + return amdgpu_gfx_reset_mes_compute(adev, ring, timedout_fence, NULL, NULL); } static void gfx_v12_0_ring_begin_use(struct amdgpu_ring *ring) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index 5e6a146109fc..0ce59fcb411b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -205,7 +205,7 @@ int mes_userq_reset_queue(struct amdgpu_device *adev, unsigned int db) { struct amdgpu_usermode_queue *uq; - bool use_mmio = false; + bool use_mmio = adev->gfx.mec.use_mmio_for_reset; unsigned long uq_id; int r; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index babf53b7b1bb..885bf9840784 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -412,7 +412,7 @@ static int reset_queue_mes(struct device_queue_manager *dqm, struct queue *q, { struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; struct kfd_process_device *pdd; - bool use_mmio = false; + bool use_mmio = adev->gfx.mec.use_mmio_for_reset; int r; pdd = kfd_get_process_device_data(q->device, q->process); @@ -447,11 +447,8 @@ int kfd_reset_queue_mes(struct device_queue_manager *dqm, int queue_type, static int reset_queues_mes(struct device_queue_manager *dqm) { struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; - int hqd_info_size = 
adev->mes.hung_queue_hqd_info_offset; - int num_hung = 0, r = 0, i, pipe, queue, queue_type; - u32 *hung_array = dqm->hung_db_array; - struct amdgpu_mes_hung_queue_hqd_info *hqd_info = dqm->hqd_info; - struct queue *q; + unsigned int num_hung = 0; + int r = 0; if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) { r = -ENOTRECOVERABLE; @@ -467,51 +464,9 @@ static int reset_queues_mes(struct device_queue_manager *dqm) goto fail; } - if (!hung_array || !hqd_info) { - r = -ENOMEM; - goto fail; - } - - memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info)); - - /* - * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called - * post suspend_all as reset & detect will return all hung queue types. - * - * Passed parameter is for targeting queues not scheduled by MES add_queue. - */ - r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE, - true, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1); - - if (!num_hung || r) { - r = -ENOTRECOVERABLE; + r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, NULL, &num_hung); + if (r) goto fail; - } - - /* MES resets queue/pipe and cleans up internally */ - for (i = 0; i < num_hung; i++) { - hqd_info[i].bit0_31 = hung_array[i + hqd_info_size]; - pipe = hqd_info[i].pipe_index; - queue = hqd_info[i].queue_index; - queue_type = hqd_info[i].queue_type; - - if (queue_type != MES_QUEUE_TYPE_COMPUTE && - queue_type != MES_QUEUE_TYPE_SDMA) { - pr_warn("Unsupported hung queue reset type: %d\n", queue_type); - hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET; - continue; - } - - q = find_queue_by_doorbell_offset(dqm, hung_array[i]); - /* skip queues not owned by KFD */ - if (!q) { - continue; - } else { - r = reset_queue_mes(dqm, q, queue_type, pipe, queue, hung_array[i]); - if (r) - goto fail; - } - } dqm->detect_hang_count = num_hung; kfd_signal_reset_event(dqm->dev); @@ -529,22 +484,18 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm) if 
(!down_read_trylock(&adev->reset_domain->sem)) return -EIO; - r = amdgpu_mes_suspend(adev, ffs(dqm->dev->xcc_mask) - 1); - up_read(&adev->reset_domain->sem); - - if (r) { - if (!reset_queues_mes(dqm)) { - r = 0; - goto out; - } - dev_err(adev->dev, "failed to suspend gangs from MES\n"); - dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); - kfd_hws_hang(dqm); + if (!reset_queues_mes(dqm)) { + r = 0; + goto out; } + + dev_err(adev->dev, "failed to suspend gangs from MES\n"); + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); + kfd_hws_hang(dqm); out: - resume_all_queues_mes(dqm); + up_read(&adev->reset_domain->sem); return r; } -- 2.49.0
