Support high-frequency querying of the sriov ras block error count: 1. Create shared memory and fill it with the RAS_CMD__GET_ALL_BLOCK_ECC_STATUS ras command. 2. The RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command and the shared memory are registered to the sriov host ras auto-update list via the RAS_CMD__SET_CMD_AUTO_UPDATE command. 3. Once the sriov host detects a ras error, it will automatically execute the RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command and write the result to the shared memory.
Signed-off-by: YiPeng Chai <[email protected]> Reviewed-by: Tao Zhou <[email protected]> --- .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c | 112 ++++++++++++++++++ .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h | 9 ++ drivers/gpu/drm/amd/ras/rascore/ras_cmd.h | 33 ++++++ 3 files changed, 154 insertions(+) diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c index 992ff214f30a..73568a6e3463 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c @@ -235,9 +235,90 @@ static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core, return RAS_CMD__SUCCESS; } +static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev, + struct vram_blocks_ecc *blks_ecc) +{ + struct ras_cmd_ctx *rcmd; + + if (!blks_ecc || !blks_ecc->bo || !blks_ecc->cpu_addr) + return -EINVAL; + + rcmd = (struct ras_cmd_ctx *)blks_ecc->cpu_addr; + + rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS; + rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req); + rcmd->output_buf_size = blks_ecc->size - sizeof(*rcmd); + + return 0; +} + +static int __set_cmd_auto_update(struct amdgpu_device *adev, + enum ras_cmd_id cmd_id, uint64_t gpa_addr, uint32_t len, bool reg) +{ + struct ras_cmd_auto_update_req req = {0}; + struct ras_cmd_auto_update_rsp rsp = {0}; + int ret; + + req.mode = reg ? 
1 : 0; + req.cmd_id = cmd_id; + req.addr = gpa_addr; + req.len = len; + ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__SET_CMD_AUTO_UPDATE, + &req, sizeof(req), &rsp, sizeof(rsp)); + + return ret; +} + +static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct amdgpu_device *adev = ras_core->dev; + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + struct amdgpu_virt_ras_cmd *virt_ras = + (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd; + struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc; + struct ras_cmd_ctx *blks_ecc_cmd_ctx; + struct ras_cmd_blocks_ecc_rsp *blks_ecc_rsp; + struct ras_cmd_block_ecc_info_req *input_data = + (struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw; + struct ras_cmd_block_ecc_info_rsp *output_data = + (struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw; + int ret = 0; + + if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + if (input_data->block_id >= MAX_RAS_BLOCK_NUM) + return RAS_CMD__ERROR_INVALID_INPUT_DATA; + + if (__fill_get_blocks_ecc_cmd(adev, blks_ecc)) + return RAS_CMD__ERROR_GENERIC; + + if (!virt_ras->blocks_ecc.auto_update_actived) { + ret = __set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS, + blks_ecc->mc_addr - adev->gmc.vram_start, + blks_ecc->size, true); + if (ret) + return ret; + + blks_ecc->auto_update_actived = true; + } + + blks_ecc_cmd_ctx = blks_ecc->cpu_addr; + blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw; + + output_data->ce_count = blks_ecc_rsp->blocks[input_data->block_id].ce_count; + output_data->ue_count = blks_ecc_rsp->blocks[input_data->block_id].ue_count; + output_data->de_count = blks_ecc_rsp->blocks[input_data->block_id].de_count; + + cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp); + return RAS_CMD__SUCCESS; +} + static struct ras_cmd_func_map amdgpu_virt_ras_cmd_maps[] 
= { {RAS_CMD__GET_CPER_SNAPSHOT, amdgpu_virt_ras_get_cper_snapshot}, {RAS_CMD__GET_CPER_RECORD, amdgpu_virt_ras_get_cper_records}, + {RAS_CMD__GET_BLOCK_ECC_STATUS, amdgpu_virt_ras_get_block_ecc}, }; int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core, @@ -294,10 +375,41 @@ int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev) int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev) { + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + struct amdgpu_virt_ras_cmd *virt_ras = + (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd; + struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc; + + memset(blks_ecc, 0, sizeof(*blks_ecc)); + blks_ecc->size = PAGE_SIZE; + if (amdgpu_bo_create_kernel(adev, blks_ecc->size, + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, + &blks_ecc->bo, &blks_ecc->mc_addr, + (void **)&blks_ecc->cpu_addr)) + return -ENOMEM; + return 0; } int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev) { + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + struct amdgpu_virt_ras_cmd *virt_ras = + (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd; + struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc; + + if (blks_ecc->bo) { + __set_cmd_auto_update(adev, + RAS_CMD__GET_ALL_BLOCK_ECC_STATUS, + blks_ecc->mc_addr - adev->gmc.vram_start, + blks_ecc->size, false); + + memset(blks_ecc->cpu_addr, 0, blks_ecc->size); + amdgpu_bo_free_kernel(&blks_ecc->bo, + &blks_ecc->mc_addr, &blks_ecc->cpu_addr); + + memset(blks_ecc, 0, sizeof(*blks_ecc)); + } + return 0; } diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h index addc693c2926..ae7bf67b3a3b 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.h @@ -30,8 +30,17 @@ struct remote_batch_trace_mgr { struct ras_cmd_batch_trace_record_rsp batch_trace; }; +struct vram_blocks_ecc { + struct amdgpu_bo *bo; + uint64_t mc_addr; + void 
*cpu_addr; + uint32_t size; + bool auto_update_actived; +}; + struct amdgpu_virt_ras_cmd { struct remote_batch_trace_mgr batch_mgr; + struct vram_blocks_ecc blocks_ecc; }; int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h index 48a0715eb821..b9833812c31f 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h @@ -75,6 +75,8 @@ enum ras_cmd_id { RAS_CMD__GET_CPER_RECORD, RAS_CMD__GET_BATCH_TRACE_SNAPSHOT, RAS_CMD__GET_BATCH_TRACE_RECORD, + RAS_CMD__GET_ALL_BLOCK_ECC_STATUS, + RAS_CMD__SET_CMD_AUTO_UPDATE, RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END, }; @@ -411,6 +413,37 @@ struct ras_cmd_batch_trace_record_rsp { struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM]; }; +struct ras_cmd_auto_update_req { + struct ras_cmd_dev_handle dev; + uint32_t mode; + uint32_t cmd_id; + uint64_t addr; + uint32_t len; + uint32_t reserved[5]; +}; + +struct ras_cmd_auto_update_rsp { + uint32_t version; + uint32_t reserved[4]; +}; + +struct ras_cmd_blocks_ecc_req { + struct ras_cmd_dev_handle dev; +}; + +struct ras_cmd_block_ecc { + uint32_t ce_count; + uint32_t ue_count; + uint32_t de_count; +}; + +#define MAX_RAS_BLOCK_NUM 20 +struct ras_cmd_blocks_ecc_rsp { + uint32_t version; + uint32_t reserved[5]; + struct ras_cmd_block_ecc blocks[MAX_RAS_BLOCK_NUM]; +}; + #pragma pack(pop) int ras_cmd_init(struct ras_core_context *ras_core); -- 2.34.1
