[AMD Official Use Only - AMD Internal Distribution Only]

One comment below. With that addressed
Reviewed-by: Harish Kasiviswanathan <[email protected]>


-----Original Message-----
From: Joshi, Mukul <[email protected]>
Sent: Friday, September 20, 2024 5:07 PM
To: [email protected]
Cc: Kasiviswanathan, Harish <[email protected]>; Joshi, Mukul 
<[email protected]>
Subject: [PATCH 2/2] drm/amdkfd: Fix CU occupancy for GFX 9.4.3

Make CU occupancy calculations work on GFX 9.4.3 by
updating the logic to handle multiple XCCs correctly.

Signed-off-by: Mukul Joshi <[email protected]>
---
v1->v2:
- Break into 2 patches, one for the generic change
  and the other for GFX v9.4.3.
- Incorporate Harish's comments.

 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 12 ++++++------
 drivers/gpu/drm/amd/amdkfd/kfd_process.c          | 10 +++++++++-
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index fe8a8e7e9a9a..e6bc808d9c59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -963,14 +963,14 @@ static void get_wave_count(struct amdgpu_device *adev, 
int queue_idx,
         */
        pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
        queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
-       soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
-       reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst,
+       soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst));
+       reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, inst),
                                  mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot);
        wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
        if (wave_cnt != 0) {
                queue_cnt->wave_cnt += wave_cnt;
                queue_cnt->doorbell_off =
-                       (RREG32_SOC15(GC, inst, mmCP_HQD_PQ_DOORBELL_CONTROL) &
+                       (RREG32_SOC15(GC, GET_INST(GC, inst), 
mmCP_HQD_PQ_DOORBELL_CONTROL) &
                         CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >>
                         CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
        }
@@ -1034,7 +1034,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device 
*adev,
        DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);

        lock_spi_csq_mutexes(adev);
-       soc15_grbm_select(adev, 1, 0, 0, 0, inst);
+       soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst));

        /*
         * Iterate through the shader engines and arrays of the device
@@ -1047,7 +1047,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device 
*adev,
        se_cnt = adev->gfx.config.max_shader_engines;
        for (se_idx = 0; se_idx < se_cnt; se_idx++) {
                amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst);
-               queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
+               queue_map = RREG32_SOC15(GC, GET_INST(GC, inst), 
mmSPI_CSQ_WF_ACTIVE_STATUS);

                /*
                 * Assumption: queue map encodes following schema: four
@@ -1072,7 +1072,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device 
*adev,
        }

        amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst);
-       soc15_grbm_select(adev, 0, 0, 0, 0, inst);
+       soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst));
        unlock_spi_csq_mutexes(adev);

        /* Update the output parameters and return */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d73841268c9b..831e0e92bd23 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -292,8 +292,13 @@ static int kfd_get_cu_occupancy(struct attribute *attr, 
char *buffer)
        wave_cnt = 0;
        max_waves_per_cu = 0;

+       /*
+        * For GFX9.4.3, fetch the CU occupancy from the first XCC in the 
partition.
+        * Later we multiply the wave count by number of XCCs in the partition 
to get
+        * the total wave counts across all XCCs in the partition.
+        */

Could you please elaborate a little bit on this comment? So, here the assumption 
is that it is an AQL queue with cooperative dispatch. The values will not be 
accurate for PM4 user queues.

        dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
-                       &max_waves_per_cu, 0);
+                       &max_waves_per_cu, ffs(dev->xcc_mask) - 1);

        for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
                if (cu_occupancy[i].wave_cnt != 0 &&
@@ -302,6 +307,9 @@ static int kfd_get_cu_occupancy(struct attribute *attr, 
char *buffer)
                        wave_cnt += cu_occupancy[i].wave_cnt;
        }

+       /* Update wave_cnt for the number of XCCs in the partition */
+       wave_cnt *= NUM_XCC(dev->xcc_mask);
+
        /* Translate wave count to number of compute units */
        cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
        return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
--
2.35.1

Reply via email to