From: Alex Deucher <[email protected]>

Add helpers to handle MES compute queue resets when multiple queues
are affected.  Can be used by both KGD and KFD.

v2: squash in updates
v3: squash in userq updates

Co-developed-by: Jesse Zhang <[email protected]>
Co-developed-by: Amber Lin <[email protected]>
Signed-off-by: Amber Lin <[email protected]>
Signed-off-by: Jesse Zhang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
Reviewed-by: Jesse Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 140 +++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   9 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |   6 +
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  |   2 +
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c  |   2 +
 drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c  |   2 +
 6 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index de8c85dfc4c6..960d192076de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -34,6 +34,7 @@
 #include "amdgpu_xcp.h"
 #include "amdgpu_xgmi.h"
 #include "amdgpu_mes.h"
+#include "mes_userqueue.h"
 #include "nvd.h"
 
 /* delay 0.1 second to enable gfx off feature */
@@ -1976,15 +1977,25 @@ int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
                               bool use_mmio)
 {
        struct amdgpu_device *adev = ring->adev;
+       bool reinit_queue;
        int r;
 
+       if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) &&
+           adev->mes.compute_pipe_reset_enabled)
+               reinit_queue = true;
+       else if ((ring->funcs->type == AMDGPU_RING_TYPE_GFX) &&
+                adev->mes.gfx_pipe_reset_enabled)
+               reinit_queue = true;
+       else
+               reinit_queue = use_mmio;
+
        amdgpu_ring_reset_helper_begin(ring, timedout_fence);
 
        r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 0);
        if (r)
                return r;
 
-       if (use_mmio) {
+       if (reinit_queue) {
                r = amdgpu_mes_unmap_legacy_queue(adev, ring,
                                                  RESET_QUEUES, 0, 0, 0);
                if (r)
@@ -2159,6 +2170,133 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev)
        }
 }
 
+/*
+ * Start the DRM scheduler work queue of every kernel compute ring of @adev,
+ * leaving out @guilty_ring (which may be NULL, in which case no ring is
+ * skipped).
+ */
+static void amdgpu_gfx_reset_start_compute_scheds(struct amdgpu_device *adev,
+                                                 struct amdgpu_ring *guilty_ring)
+{
+       int idx = 0;
+
+       while (idx < adev->gfx.num_compute_rings) {
+               struct amdgpu_ring *kcq = &adev->gfx.compute_ring[idx++];
+
+               /* the guilty ring's scheduler is not touched by this helper */
+               if (kcq != guilty_ring)
+                       drm_sched_wqueue_start(&kcq->sched);
+       }
+}
+
+/*
+ * Stop the DRM scheduler work queue of every kernel compute ring of @adev,
+ * leaving out @guilty_ring (which may be NULL, in which case no ring is
+ * skipped).
+ */
+static void amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device *adev,
+                                                struct amdgpu_ring *guilty_ring)
+{
+       int idx;
+
+       for (idx = 0; idx < adev->gfx.num_compute_rings; idx++) {
+               struct amdgpu_ring *kcq = &adev->gfx.compute_ring[idx];
+
+               /* the guilty ring's scheduler is not touched by this helper */
+               if (kcq != guilty_ring)
+                       drm_sched_wqueue_stop(&kcq->sched);
+       }
+}
+
+/*
+ * Reset the kernel compute queue whose doorbell index equals @db, if any.
+ *
+ * @guilty_ring is never reset here, even when its doorbell matches.  Only the
+ * first matching ring is reset.
+ *
+ * Returns 0 when no ring matched or the reset succeeded, otherwise the error
+ * returned by amdgpu_gfx_mes_reset_queue().
+ */
+static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev,
+                                   struct amdgpu_ring *guilty_ring,
+                                   unsigned int db)
+{
+       bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
+       struct amdgpu_fence *fence;
+       int idx;
+
+       for (idx = 0; idx < adev->gfx.num_compute_rings; idx++) {
+               struct amdgpu_ring *kcq = &adev->gfx.compute_ring[idx];
+
+               if (kcq == guilty_ring || kcq->doorbell_index != db)
+                       continue;
+
+               /* first match wins; the scan stops here either way */
+               fence = amdgpu_ring_find_guilty_fence(kcq);
+               return amdgpu_gfx_mes_reset_queue(kcq, 0, fence, use_mmio);
+       }
+
+       return 0;
+}
+
+/**
+ * amdgpu_gfx_reset_mes_compute - reset MES compute queues after a hang
+ * @adev: amdgpu device
+ * @ring: kernel compute ring to reset unconditionally, or NULL
+ * @guilty_fence: fence handed to the queue reset of @ring
+ * @uq: user queue to reset unconditionally, or NULL
+ * @hung_queue_count: optional; receives the number of hung queues reported
+ *                    by hang detection, 0 unless detection ran and succeeded
+ *
+ * Stops the schedulers of all kernel compute rings other than @ring, asks MES
+ * to suspend everything and, if that fails, to detect and reset the hung
+ * queues.  The originating @ring and/or @uq are then reset, followed by every
+ * KCQ, KFD queue and KGD user queue reported as hung.  MES is resumed on every
+ * path past the suspend; the schedulers are only restarted on success.
+ *
+ * The whole sequence runs under adev->gfx.mec.reset_mutex.
+ *
+ * Returns 0 on success, -ENOMEM if the hung-doorbell array was never
+ * allocated, otherwise the error of the first step that failed.
+ */
+int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
+                                struct amdgpu_ring *ring,
+                                struct amdgpu_fence *guilty_fence,
+                                struct amdgpu_usermode_queue *uq,
+                                unsigned int *hung_queue_count)
+{
+       u32 *db_array = adev->gfx.mec.mes_hung_db_array;
+       bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
+       struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
+       int r, resume_r, pipe, queue, queue_type;
+       unsigned int num_hung = 0;
+       unsigned int i;
+
+       /* give the caller a defined value on every return path */
+       if (hung_queue_count)
+               *hung_queue_count = 0;
+
+       /* the array is allocated at MES init; that allocation may have failed */
+       if (!db_array)
+               return -ENOMEM;
+
+       /* the HQD info records live in the same buffer, past the doorbells */
+       hqd_info = (struct amdgpu_mes_hung_queue_hqd_info *)
+               &db_array[adev->mes.hung_queue_hqd_info_offset];
+
+       guard(mutex)(&adev->gfx.mec.reset_mutex);
+       /* stop the drm schedulers for all compute queues */
+       amdgpu_gfx_reset_stop_compute_scheds(adev, ring);
+       /* suspend all will determine which queues are hung.
+        * reset detect will return the array of bad queue doorbells
+        */
+       r = amdgpu_mes_suspend(adev, 0);
+       /* if suspend all succeeds, there should be no hung queues */
+       if (!r)
+               /* always reset the KCQ/userq since we need to signal the fence
+                * and we could be stuck in a loop which is preemptable.
+                */
+               goto fence_reset;
+       r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
+                                                   true, &num_hung, db_array, 0);
+       if (r)
+               goto out;
+       if (hung_queue_count)
+               *hung_queue_count = num_hung;
+
+fence_reset:
+       /* reset the queue this came from if specified */
+       if (ring) {
+               r = amdgpu_gfx_mes_reset_queue(ring, 0, guilty_fence, use_mmio);
+               if (r)
+                       goto out;
+       }
+       if (uq) {
+               r = mes_userq_reset(uq);
+               if (r)
+                       goto out;
+       }
+       for (i = 0; i < num_hung; i++) {
+               pipe = hqd_info[i].pipe_index;
+               queue = hqd_info[i].queue_index;
+               queue_type = hqd_info[i].queue_type;
+
+               /* reset any KCQs */
+               r = amdgpu_gfx_reset_mes_kcq(adev, ring, db_array[i]);
+               if (r)
+                       goto out;
+               /* reset any KFD queues */
+               r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe, queue,
+                                                 db_array[i]);
+               if (r)
+                       goto out;
+               /* reset KGD user queues */
+               r = mes_userq_reset_queue(adev, uq, queue_type, pipe, queue,
+                                         db_array[i]);
+               if (r)
+                       goto out;
+       }
+out:
+       /* resume all will enable the non-hung queues */
+       resume_r = amdgpu_mes_resume(adev, 0);
+       /* an earlier error takes precedence; otherwise report a failed resume */
+       if (!r)
+               r = resume_r;
+       /* NOTE(review): on failure the schedulers stay stopped; presumably the
+        * caller escalates to a heavier reset that restarts them - confirm.
+        */
+       if (!r)
+               amdgpu_gfx_reset_start_compute_scheds(adev, ring);
+
+       return r;
+}
+
 int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
                                      unsigned int cleaner_shader_size)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index f9175faa64ab..8ef2ef394e9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -36,6 +36,8 @@
 #include "amdgpu_ring_mux.h"
 #include "amdgpu_xcp.h"
 
+struct amdgpu_usermode_queue;
+
 /* GFX current status */
 #define AMDGPU_GFX_NORMAL_MODE                 0x00000000L
 #define AMDGPU_GFX_SAFE_MODE                   0x00000001L
@@ -117,6 +119,8 @@ struct amdgpu_mec {
        u32 num_queue_per_pipe;
        void                    *mqd_backup[AMDGPU_MAX_COMPUTE_RINGS * 
AMDGPU_MAX_GC_INSTANCES];
        bool use_mmio_for_reset;
+       u32 *mes_hung_db_array;
+       struct mutex            reset_mutex;
 };
 
 struct amdgpu_mec_bitmap {
@@ -642,6 +646,11 @@ int amdgpu_gfx_poison_consumption_handler(struct 
amdgpu_device *adev,
 bool amdgpu_gfx_is_master_xcc(struct amdgpu_device *adev, int xcc_id);
 int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev);
 void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev);
+int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
+                                struct amdgpu_ring *ring,
+                                struct amdgpu_fence *guilty_fence,
+                                struct amdgpu_usermode_queue *uq,
+                                unsigned int *hung_queue_count);
 void amdgpu_gfx_ras_error_func(struct amdgpu_device *adev,
                void *ras_error_status,
                void (*func)(struct amdgpu_device *adev, void *ras_error_status,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 370e8d159b6f..ec4d9a1e029a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -252,6 +252,10 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
                }
        }
 
+       adev->gfx.mec.mes_hung_db_array =
+               kcalloc(amdgpu_mes_get_hung_queue_db_array_size(adev),
+                       sizeof(u32), GFP_KERNEL);
+
        return 0;
 
 error_doorbell:
@@ -279,6 +283,8 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
        int i;
        int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
 
+       kfree(adev->gfx.mec.mes_hung_db_array);
+
        amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
                              &adev->mes.event_log_gpu_addr,
                              &adev->mes.event_log_cpu_addr);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 1a214c274ad0..32e01eb311c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1908,6 +1908,8 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block 
*ip_block)
        adev->gfx.me.use_mmio_for_reset = false;
        adev->gfx.mec.use_mmio_for_reset = true;
 
+       mutex_init(&adev->gfx.mec.reset_mutex);
+
        return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 5beb0ae980d0..247bcb7034e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -1606,6 +1606,8 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block 
*ip_block)
        adev->gfx.me.use_mmio_for_reset = false;
        adev->gfx.mec.use_mmio_for_reset = true;
 
+       mutex_init(&adev->gfx.mec.reset_mutex);
+
        return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
index 033f15e21ad3..7f8e43130bd2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
@@ -1287,6 +1287,8 @@ static int gfx_v12_1_sw_init(struct amdgpu_ip_block 
*ip_block)
        if (r)
                return r;
 
+       mutex_init(&adev->gfx.mec.reset_mutex);
+
        return 0;
 }
 
-- 
2.49.0

Reply via email to