From: Michael Chen <[email protected]>

Need to allocate memory for MEC FW data and program the CP_MEC_MDBASE
registers for each XCC respectively.
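As an aside for reviewers (illustration only, not part of the patch):
after this change the shared MEC FW data BO holds num_pipe_per_mec
64 KiB-aligned slots per XCC, and the slot used for both the memcpy
and the CP_MEC_MDBASE_LO/HI programming starts at
(xcc_id * num_pipe_per_mec + pipe) * ALIGN(fw_data_size, 64 * 1024).
A minimal standalone C sketch of that layout math follows; the
NUM_PIPE_PER_MEC and FW_DATA_SIZE values are made-up examples, not
values queried from hardware:

#include <stdint.h>
#include <stdio.h>

/* Round x up to a multiple of a (a must be a power of two). */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

#define NUM_PIPE_PER_MEC 4U      /* example value only */
#define FW_DATA_SIZE     0x8000U /* example MEC FW data size in bytes */

/* Byte offset of the FW data slot for (xcc_id, pipe), mirroring the
 * (xcc_id * num_pipe_per_mec + i) * ALIGN(fw_data_size, 64 * 1024)
 * arithmetic in the patch. */
static uint64_t mec_fw_data_offset(unsigned int xcc_id, unsigned int pipe)
{
	return (uint64_t)(xcc_id * NUM_PIPE_PER_MEC + pipe) *
	       ALIGN_UP(FW_DATA_SIZE, 64 * 1024);
}

int main(void)
{
	for (unsigned int xcc = 0; xcc < 2; xcc++)
		for (unsigned int pipe = 0; pipe < NUM_PIPE_PER_MEC; pipe++)
			printf("xcc %u pipe %u -> offset 0x%llx\n", xcc, pipe,
			       (unsigned long long)mec_fw_data_offset(xcc, pipe));
	return 0;
}

(In the driver itself the memcpy offset is additionally divided by 4
because fw_data_ptr is a u32 pointer.)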
Signed-off-by: Michael Chen <[email protected]>
Acked-by: Harish Kasiviswanathan <[email protected]>
Reviewed-by: Shaoyun.liu <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c | 192 +++++++++++++------------
 1 file changed, 98 insertions(+), 94 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
index 61ffba9a252bd..347912596a1b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
@@ -1887,20 +1887,18 @@ static void gfx_v12_1_xcc_cp_compute_enable(struct amdgpu_device *adev,
 }
 
 static int gfx_v12_1_xcc_cp_compute_load_microcode_rs64(struct amdgpu_device *adev,
-							int xcc_id)
+							uint16_t xcc_mask)
 {
 	const struct gfx_firmware_header_v2_0 *mec_hdr;
 	const __le32 *fw_ucode, *fw_data;
 	u32 tmp, fw_ucode_size, fw_data_size;
 	u32 i, usec_timeout = 50000; /* Wait for 50 ms */
 	u32 *fw_ucode_ptr, *fw_data_ptr;
-	int r;
+	int r, xcc_id;
 
 	if (!adev->gfx.mec_fw)
 		return -EINVAL;
 
-	gfx_v12_1_xcc_cp_compute_enable(adev, false, xcc_id);
-
 	mec_hdr = (const struct gfx_firmware_header_v2_0 *)adev->gfx.mec_fw->data;
 	amdgpu_ucode_print_gfx_hdr(&mec_hdr->header);
 
@@ -1925,7 +1923,7 @@ static int gfx_v12_1_xcc_cp_compute_load_microcode_rs64(struct amdgpu_device *ad
 	r = amdgpu_bo_create_reserved(adev,
 				      ALIGN(fw_data_size, 64 * 1024) *
-				      adev->gfx.mec.num_pipe_per_mec,
+				      adev->gfx.mec.num_pipe_per_mec * NUM_XCC(xcc_mask),
 				      64 * 1024,
 				      AMDGPU_GEM_DOMAIN_VRAM,
 				      &adev->gfx.mec.mec_fw_data_obj,
 				      &adev->gfx.mec.mec_fw_data_gpu_addr,
@@ -1937,8 +1935,12 @@ static int gfx_v12_1_xcc_cp_compute_load_microcode_rs64(struct amdgpu_device *ad
 	}
 
 	memcpy(fw_ucode_ptr, fw_ucode, fw_ucode_size);
-	for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++) {
-		memcpy(fw_data_ptr + i * ALIGN(fw_data_size, 64 * 1024) / 4, fw_data, fw_data_size);
+	for_each_inst(xcc_id, xcc_mask) {
+		for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++) {
+			u32 offset = (xcc_id * adev->gfx.mec.num_pipe_per_mec + i) *
+				     ALIGN(fw_data_size, 64 * 1024) / 4;
+			memcpy(fw_data_ptr + offset, fw_data, fw_data_size);
+		}
 	}
 
 	amdgpu_bo_kunmap(adev->gfx.mec.mec_fw_obj);
@@ -1946,75 +1948,81 @@ static int gfx_v12_1_xcc_cp_compute_load_microcode_rs64(struct amdgpu_device *ad
 	amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_obj);
 	amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_data_obj);
 
-	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_CNTL);
-	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, VMID, 0);
-	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, EXE_DISABLE, 0);
-	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, CACHE_POLICY, 0);
-	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_CNTL, tmp);
-
-	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_BASE_CNTL);
-	tmp = REG_SET_FIELD(tmp, CP_MEC_DC_BASE_CNTL, VMID, 0);
-	tmp = REG_SET_FIELD(tmp, CP_MEC_DC_BASE_CNTL, CACHE_POLICY, 0);
-	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_BASE_CNTL, tmp);
-
-	mutex_lock(&adev->srbm_mutex);
-	for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++) {
-		soc_v1_0_grbm_select(adev, 1, i, 0, 0, GET_INST(GC, xcc_id));
+	for_each_inst(xcc_id, xcc_mask) {
+		gfx_v12_1_xcc_cp_compute_enable(adev, false, xcc_id);
 
-		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_MDBASE_LO,
-			     lower_32_bits(adev->gfx.mec.mec_fw_data_gpu_addr +
-			     i * ALIGN(fw_data_size, 64 * 1024)));
-		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_MDBASE_HI,
-			     upper_32_bits(adev->gfx.mec.mec_fw_data_gpu_addr +
-			     i * ALIGN(fw_data_size, 64 * 1024)));
+		tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_CNTL);
+		tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, VMID, 0);
+		tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, EXE_DISABLE, 0);
+		tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, CACHE_POLICY, 0);
+		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_CNTL, tmp);
 
-		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_LO,
-			     lower_32_bits(adev->gfx.mec.mec_fw_gpu_addr));
-		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_HI,
-			     upper_32_bits(adev->gfx.mec.mec_fw_gpu_addr));
-	}
-	mutex_unlock(&adev->srbm_mutex);
-	soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, 0));
+		tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_BASE_CNTL);
+		tmp = REG_SET_FIELD(tmp, CP_MEC_DC_BASE_CNTL, VMID, 0);
+		tmp = REG_SET_FIELD(tmp, CP_MEC_DC_BASE_CNTL, CACHE_POLICY, 0);
+		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_BASE_CNTL, tmp);
 
-	/* Trigger an invalidation of the L1 instruction caches */
-	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL);
-	tmp = REG_SET_FIELD(tmp, CP_MEC_DC_OP_CNTL, INVALIDATE_DCACHE, 1);
-	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL, tmp);
+		mutex_lock(&adev->srbm_mutex);
+		for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++) {
+			soc_v1_0_grbm_select(adev, 1, i, 0, 0, GET_INST(GC, xcc_id));
+
+			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_MDBASE_LO,
+				     lower_32_bits(adev->gfx.mec.mec_fw_data_gpu_addr +
+				     (xcc_id * adev->gfx.mec.num_pipe_per_mec + i) *
+				     ALIGN(fw_data_size, 64 * 1024)));
+			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_MDBASE_HI,
+				     upper_32_bits(adev->gfx.mec.mec_fw_data_gpu_addr +
+				     (xcc_id * adev->gfx.mec.num_pipe_per_mec + i) *
+				     ALIGN(fw_data_size, 64 * 1024)));
+
+			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_LO,
+				     lower_32_bits(adev->gfx.mec.mec_fw_gpu_addr));
+			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_BASE_HI,
+				     upper_32_bits(adev->gfx.mec.mec_fw_gpu_addr));
+		}
+		mutex_unlock(&adev->srbm_mutex);
+		soc_v1_0_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, 0));
 
-	/* Wait for invalidation complete */
-	for (i = 0; i < usec_timeout; i++) {
+		/* Trigger an invalidation of the L1 instruction caches */
 		tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL);
-		if (1 == REG_GET_FIELD(tmp, CP_MEC_DC_OP_CNTL,
-				       INVALIDATE_DCACHE_COMPLETE))
-			break;
-		udelay(1);
-	}
-
-	if (i >= usec_timeout) {
-		dev_err(adev->dev, "failed to invalidate instruction cache\n");
-		return -EINVAL;
-	}
+		tmp = REG_SET_FIELD(tmp, CP_MEC_DC_OP_CNTL, INVALIDATE_DCACHE, 1);
+		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL, tmp);
+
+		/* Wait for invalidation complete */
+		for (i = 0; i < usec_timeout; i++) {
+			tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_DC_OP_CNTL);
+			if (1 == REG_GET_FIELD(tmp, CP_MEC_DC_OP_CNTL,
+					       INVALIDATE_DCACHE_COMPLETE))
+				break;
+			udelay(1);
+		}
 
-	/* Trigger an invalidation of the L1 instruction caches */
-	tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL);
-	tmp = REG_SET_FIELD(tmp, CP_CPC_IC_OP_CNTL, INVALIDATE_CACHE, 1);
-	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL, tmp);
+		if (i >= usec_timeout) {
+			dev_err(adev->dev, "failed to invalidate instruction cache\n");
+			return -EINVAL;
+		}
 
-	/* Wait for invalidation complete */
-	for (i = 0; i < usec_timeout; i++) {
+		/* Trigger an invalidation of the L1 instruction caches */
 		tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL);
-		if (1 == REG_GET_FIELD(tmp, CP_CPC_IC_OP_CNTL,
-				       INVALIDATE_CACHE_COMPLETE))
-			break;
-		udelay(1);
-	}
+		tmp = REG_SET_FIELD(tmp, CP_CPC_IC_OP_CNTL, INVALIDATE_CACHE, 1);
+		WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL, tmp);
+
+		/* Wait for invalidation complete */
+		for (i = 0; i < usec_timeout; i++) {
+			tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_IC_OP_CNTL);
+			if (1 == REG_GET_FIELD(tmp, CP_CPC_IC_OP_CNTL,
+					       INVALIDATE_CACHE_COMPLETE))
+				break;
+			udelay(1);
+		}
 
-	if (i >= usec_timeout) {
-		dev_err(adev->dev, "failed to invalidate instruction cache\n");
-		return -EINVAL;
-	}
+		if (i >= usec_timeout) {
+			dev_err(adev->dev, "failed to invalidate instruction cache\n");
+			return -EINVAL;
+		}
 
-	gfx_v12_1_xcc_set_mec_ucode_start_addr(adev, xcc_id);
+		gfx_v12_1_xcc_set_mec_ucode_start_addr(adev, xcc_id);
+	}
 
 	return 0;
 }
@@ -2411,42 +2419,43 @@ static int gfx_v12_1_xcc_kcq_resume(struct amdgpu_device *adev,
 	return r;
 }
 
-static int gfx_v12_1_xcc_cp_resume(struct amdgpu_device *adev,
-				   int xcc_id)
+static int gfx_v12_1_xcc_cp_resume(struct amdgpu_device *adev, uint16_t xcc_mask)
 {
-	int r, i;
+	int r, i, xcc_id;
 	struct amdgpu_ring *ring;
 
-	if (!(adev->flags & AMD_IS_APU))
-		gfx_v12_1_xcc_enable_gui_idle_interrupt(adev, false, xcc_id);
-
 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
 		/* legacy firmware loading */
-		r = gfx_v12_1_xcc_cp_compute_load_microcode_rs64(adev, xcc_id);
+		r = gfx_v12_1_xcc_cp_compute_load_microcode_rs64(adev, xcc_mask);
 		if (r)
 			return r;
 	}
 
-	gfx_v12_1_xcc_cp_set_doorbell_range(adev, xcc_id);
+	for_each_inst(xcc_id, xcc_mask) {
+		if (!(adev->flags & AMD_IS_APU))
+			gfx_v12_1_xcc_enable_gui_idle_interrupt(adev, false, xcc_id);
 
-	gfx_v12_1_xcc_cp_compute_enable(adev, true, xcc_id);
+		gfx_v12_1_xcc_cp_set_doorbell_range(adev, xcc_id);
 
-	if (adev->enable_mes_kiq && adev->mes.kiq_hw_init)
-		r = amdgpu_mes_kiq_hw_init(adev, xcc_id);
-	else
-		r = gfx_v12_1_xcc_kiq_resume(adev, xcc_id);
-	if (r)
-		return r;
+		gfx_v12_1_xcc_cp_compute_enable(adev, true, xcc_id);
 
-	r = gfx_v12_1_xcc_kcq_resume(adev, xcc_id);
-	if (r)
-		return r;
+		if (adev->enable_mes_kiq && adev->mes.kiq_hw_init)
+			r = amdgpu_mes_kiq_hw_init(adev, xcc_id);
+		else
+			r = gfx_v12_1_xcc_kiq_resume(adev, xcc_id);
+		if (r)
+			return r;
 
-	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-		ring = &adev->gfx.compute_ring[i + xcc_id * adev->gfx.num_compute_rings];
-		r = amdgpu_ring_test_helper(ring);
+		r = gfx_v12_1_xcc_kcq_resume(adev, xcc_id);
 		if (r)
 			return r;
+
+		for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+			ring = &adev->gfx.compute_ring[i + xcc_id * adev->gfx.num_compute_rings];
+			r = amdgpu_ring_test_helper(ring);
+			if (r)
+				return r;
+		}
 	}
 	return 0;
 }
@@ -3923,14 +3932,9 @@ static int gfx_v12_1_xcp_resume(void *handle, uint32_t inst_mask)
 		}
 	}
 
-	tmp_mask = inst_mask;
-	for_each_inst(i, tmp_mask) {
-		r = gfx_v12_1_xcc_cp_resume(adev, i);
-		if (r)
-			return r;
-	}
+	r = gfx_v12_1_xcc_cp_resume(adev, inst_mask);
 
-	return 0;
+	return r;
 }
 
 static int gfx_v12_1_xcp_suspend(void *handle, uint32_t inst_mask)
-- 
2.52.0
