[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: Alex Deucher <[email protected]>
Sent: Wednesday, May 29, 2024 7:16 PM
To: Khatri, Sunil <[email protected]>
Cc: Deucher, Alexander <[email protected]>; Koenig, Christian 
<[email protected]>; [email protected]
Subject: Re: [PATCH v1 1/3] drm/amdgpu: add gfx9 register support in ipdump

On Wed, May 29, 2024 at 5:50 AM Sunil Khatri <[email protected]> wrote:
>
> Add general registers of gfx9 in ipdump for devcoredump support.
>
> Signed-off-by: Sunil Khatri <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 124
> +++++++++++++++++++++++++-
>  1 file changed, 123 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 3c8c5abf35ab..528a20393313 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -149,6 +149,94 @@ MODULE_FIRMWARE("amdgpu/aldebaran_sjt_mec2.bin");
>  #define mmGOLDEN_TSC_COUNT_LOWER_Renoir                0x0026
>  #define mmGOLDEN_TSC_COUNT_LOWER_Renoir_BASE_IDX       1
>
> +static const struct amdgpu_hwip_reg_entry gc_reg_list_9[] = {
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS2),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT1),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT2),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STALLED_STAT1),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STALLED_STAT1),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_BUSY_STAT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_ERROR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_BASE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_RPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_WPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_BASE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_RPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_WPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_BASE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_RPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_WPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_BASE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_CMD_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_CMD_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_CMD_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_CMD_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_LO),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_HI),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_LO),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_HI),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_LO),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_HI),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_LO),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_HI),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCPF_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCPC_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCPG_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGDS_PROTECTION_FAULT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGDS_VM_PROTECTION_FAULT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_CNTL),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmPA_CL_CNTL_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRMI_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmSQC_DCACHE_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmSQC_ICACHE_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmSQ_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmTCP_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmWD_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmVM_L2_PROTECTION_FAULT_CNTL),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmVM_L2_PROTECTION_FAULT_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_DEBUG),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_CNTL),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_INSTR_PNTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC1_INSTR_PNTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC2_INSTR_PNTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_ME_INSTR_PNTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_PFP_INSTR_PNTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_STAT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_COMMAND),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_MESSAGE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_1),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_2),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmSMU_RLC_RESPONSE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SAFE_MODE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_SAFE_MODE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_INT_STAT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_GENERAL_6),
> +       /* cp header registers */
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_HEADER_DUMP),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_ME1_HEADER_DUMP),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_ME2_HEADER_DUMP),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_PFP_HEADER_DUMP),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_ME_HEADER_DUMP),
> +       /* SE status registers */
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS_SE0),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS_SE1),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS_SE2),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS_SE3) };

This list should probably be split between gfx and non-gfx registers since MI 
chips don't have a gfx pipe, only compute.  If
adev->gfx.num_gfx_rings == 0, then the chip does not have a gfx pipe.

Sure, Alex. I will split the list into gfx-specific and non-gfx registers. 
Would it be correct to treat the registers that are present on MI300 as 
non-gfx, and the registers that are not present there as gfx?
I took a quick look, and all of the registers that these patches already dump 
are listed in the MI300 spec sheet.

Also MI300 probably needs special handling because it has multiple XCCs.  For 
now, you might want to skip the IP dump for gfx9 chips with
adev->gfx.num_gfx_rings == 0, and then add new register lists for the
MI parts later.

For MI300, by multiple XCCs do you mean physically different nodes (multiple 
nodes)? If my understanding is correct, we are only interested in dumping the 
GPU/node that caused the hang, right?

Regards
Sunil Khatri

Alex

> +
>  enum ta_ras_gfx_subblock {
>         /*CPC*/
>         TA_RAS_BLOCK__GFX_CPC_INDEX_START = 0, @@ -1994,6 +2082,20 @@
> static int gfx_v9_0_compute_ring_init(struct amdgpu_device *adev, int ring_id,
>                                 hw_prio, NULL);  }
>
> +static void gfx_v9_0_alloc_ip_dump(struct amdgpu_device *adev) {
> +       uint32_t reg_count = ARRAY_SIZE(gc_reg_list_9);
> +       uint32_t *ptr;
> +
> +       ptr = kcalloc(reg_count, sizeof(uint32_t), GFP_KERNEL);
> +       if (ptr == NULL) {
> +               DRM_ERROR("Failed to allocate memory for IP Dump\n");
> +               adev->gfx.ip_dump_core = NULL;
> +       } else {
> +               adev->gfx.ip_dump_core = ptr;
> +       }
> +}
> +
>  static int gfx_v9_0_sw_init(void *handle)  {
>         int i, j, k, r, ring_id;
> @@ -2171,6 +2273,8 @@ static int gfx_v9_0_sw_init(void *handle)
>                 return -EINVAL;
>         }
>
> +       gfx_v9_0_alloc_ip_dump(adev);
> +
>         return 0;
>  }
>
> @@ -2206,6 +2310,8 @@ static int gfx_v9_0_sw_fini(void *handle)
>         }
>         gfx_v9_0_free_microcode(adev);
>
> +       kfree(adev->gfx.ip_dump_core);
> +
>         return 0;
>  }
>
> @@ -6840,6 +6946,22 @@ static void gfx_v9_0_emit_wave_limit(struct 
> amdgpu_ring *ring, bool enable)
>         }
>  }
>
> +static void gfx_v9_ip_dump(void *handle) {
> +       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +       uint32_t i;
> +       uint32_t reg_count = ARRAY_SIZE(gc_reg_list_9);
> +
> +       if (!adev->gfx.ip_dump_core)
> +               return;
> +
> +       amdgpu_gfx_off_ctrl(adev, false);
> +       for (i = 0; i < reg_count; i++)
> +               adev->gfx.ip_dump_core[i] = 
> RREG32(SOC15_REG_ENTRY_OFFSET(gc_reg_list_9[i]));
> +       amdgpu_gfx_off_ctrl(adev, true);
> +
> +}
> +
>  static const struct amd_ip_funcs gfx_v9_0_ip_funcs = {
>         .name = "gfx_v9_0",
>         .early_init = gfx_v9_0_early_init, @@ -6856,7 +6978,7 @@
> static const struct amd_ip_funcs gfx_v9_0_ip_funcs = {
>         .set_clockgating_state = gfx_v9_0_set_clockgating_state,
>         .set_powergating_state = gfx_v9_0_set_powergating_state,
>         .get_clockgating_state = gfx_v9_0_get_clockgating_state,
> -       .dump_ip_state = NULL,
> +       .dump_ip_state = gfx_v9_ip_dump,
>         .print_ip_state = NULL,
>  };
>
> --
> 2.34.1
>

Reply via email to