When removing queues fails, KFD calls amdgpu_mes to detect and reset
hung queues, then cleans up those hung queues in KFD.
Suggested-by: Jonathan Kim <[email protected]>
Signed-off-by: Amber Lin <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
4 files changed, 153 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index f3a4ae1fd521..7cf4b3d6fc93 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -793,6 +793,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
}
+bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
+{
+ return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
+ (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
+}
+
/* Fix me -- node_id is used to identify the correct MES instances in the future */
static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
uint32_t node_id, bool enable)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 643b4f8d757a..44fa4d73bce8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
}
bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
+bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index ec8d7f4be840..1c9c350bfffe 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q, const uint32_t *restore_sdma_id);
static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
+static int resume_all_queues_mes(struct device_queue_manager *dqm);
+static int suspend_all_queues_mes(struct device_queue_manager *dqm);
+static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
+ uint32_t doorbell_offset);
+static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd);
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
@@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
return r;
}
-static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
- struct qcm_process_device *qpd)
+static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd,
+ bool is_for_reset,
+ bool flush_mes_queue)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
int r;
struct mes_remove_queue_input queue_input;
+ /* queue was already removed during reset */
+ if (q->properties.is_reset)
+ return 0;
+
if (!dqm->sched_running || dqm->sched_halt)
return 0;
if (!down_read_trylock(&adev->reset_domain->sem))
@@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
queue_input.doorbell_offset = q->properties.doorbell_off;
queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
+ queue_input.remove_queue_after_reset = flush_mes_queue;
queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
amdgpu_mes_lock(&adev->mes);
@@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
amdgpu_mes_unlock(&adev->mes);
up_read(&adev->reset_domain->sem);
+ if (is_for_reset)
+ return r;
+
if (r) {
+ if (!suspend_all_queues_mes(dqm))
+ return resume_all_queues_mes(dqm);
+
dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
q->properties.doorbell_off);
dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
@@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
return r;
}
+static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd)
+{
+ return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
+}
+
static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
{
struct device_process_node *cur;
@@ -359,6 +384,103 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
return retval;
}
+static int reset_queues_mes(struct device_queue_manager *dqm)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+ struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
+ int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
+ int num_hung = 0, r = 0, i, pipe, queue, queue_type;
+ uint32_t *hung_array;
+ struct kfd_process_device *pdd;
+ struct queue *q;
+
+ if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ /* reset should be used only in dqm locked queue reset */
+ if (WARN_ON(dqm->detect_hang_count > 0))
+ return 0;
+
+ if (!amdgpu_gpu_recovery) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+	hung_array = kzalloc(adev->mes.hung_queue_db_array_size * sizeof(uint32_t), GFP_KERNEL);
+ if (!hung_array) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+	hqd_info = kzalloc(hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info), GFP_KERNEL);