Add psp ras common functions. Signed-off-by: YiPeng Chai <yipeng.c...@amd.com> Reviewed-by: Tao Zhou <tao.zh...@amd.com> --- drivers/gpu/drm/amd/ras/rascore/ras_psp.c | 750 ++++++++++++++++++++ drivers/gpu/drm/amd/ras/rascore/ras_psp.h | 145 ++++ drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h | 231 ++++++ 3 files changed, 1126 insertions(+) create mode 100644 drivers/gpu/drm/amd/ras/rascore/ras_psp.c create mode 100644 drivers/gpu/drm/amd/ras/rascore/ras_psp.h create mode 100644 drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_psp.c b/drivers/gpu/drm/amd/ras/rascore/ras_psp.c new file mode 100644 index 000000000000..c94effd4b114 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_psp.c @@ -0,0 +1,750 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_ta_if.h" +#include "ras_psp.h" +#include "ras_psp_v13_0.h" + +/* position of instance value in sub_block_index of + * ta_ras_trigger_error_input, the sub block uses lower 12 bits + */ +#define RAS_TA_INST_MASK 0xfffff000 +#define RAS_TA_INST_SHIFT 0xc + +static const struct ras_psp_ip_func *ras_psp_get_ip_funcs( + struct ras_core_context *ras_core, uint32_t ip_version) +{ + switch (ip_version) { + case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 14): + case IP_VERSION(13, 0, 12): + return &ras_psp_v13_0; + default: + RAS_DEV_ERR(ras_core->dev, + "psp ip version(0x%x) is not supported!\n", ip_version); + break; + } + + return NULL; +} + +static int ras_psp_sync_system_ras_psp_status(struct ras_core_context *ras_core) +{ + struct ras_psp *psp = &ras_core->ras_psp; + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx; + struct ras_psp_sys_status status = {0}; + int ret; + + if (psp->sys_func && psp->sys_func->get_ras_psp_system_status) { + ret = psp->sys_func->get_ras_psp_system_status(ras_core, &status); + if (ret) + return ret; + + if (status.initialized) { + ta_ctx->preload_ras_ta_enabled = true; + ta_ctx->ras_ta_initialized = status.initialized; + ta_ctx->session_id = status.session_id; + } + + psp_ctx->external_mutex = status.psp_cmd_mutex; + } + + return 0; +} + +static int ras_psp_get_ras_ta_init_param(struct ras_core_context *ras_core, + struct ras_ta_init_param *ras_ta_param) +{ + struct ras_psp *psp = &ras_core->ras_psp; + + if (psp->sys_func && psp->sys_func->get_ras_ta_init_param) + return psp->sys_func->get_ras_ta_init_param(ras_core, ras_ta_param); + + RAS_DEV_ERR(ras_core->dev, "Not config get_ras_ta_init_param API!!\n"); + return -EACCES; +} + +static struct gpu_mem_block *ras_psp_get_gpu_mem(struct ras_core_context *ras_core, + enum gpu_mem_type mem_type) +{ + struct ras_psp *psp = &ras_core->ras_psp; + struct gpu_mem_block *gpu_mem = NULL; + int ret; + + switch (mem_type) { + case GPU_MEM_TYPE_RAS_PSP_RING: + gpu_mem = &psp->psp_ring.ras_ring_gpu_mem; + break; + case GPU_MEM_TYPE_RAS_PSP_CMD: + gpu_mem = &psp->psp_ctx.psp_cmd_gpu_mem; + break; + case GPU_MEM_TYPE_RAS_PSP_FENCE: + gpu_mem = &psp->psp_ctx.out_fence_gpu_mem; + break; + case GPU_MEM_TYPE_RAS_TA_FW: + gpu_mem = &psp->ta_ctx.fw_gpu_mem; + break; + case GPU_MEM_TYPE_RAS_TA_CMD: + gpu_mem = &psp->ta_ctx.cmd_gpu_mem; + break; + default: + return NULL; + } + + if (!gpu_mem->ref_count) { + ret = ras_core_get_gpu_mem(ras_core, mem_type, gpu_mem); + if (ret) + return NULL; + gpu_mem->mem_type = mem_type; + } + + gpu_mem->ref_count++; + + return gpu_mem; +} + +static int ras_psp_put_gpu_mem(struct ras_core_context *ras_core, + struct gpu_mem_block *gpu_mem) +{ + if (!gpu_mem) + return 0; + + gpu_mem->ref_count--; + + if (gpu_mem->ref_count > 0) { + return 0; + } else if (gpu_mem->ref_count < 0) { + RAS_DEV_WARN(ras_core->dev, + "Duplicate free gpu memory %u\n", gpu_mem->mem_type); + } else { + ras_core_put_gpu_mem(ras_core, gpu_mem->mem_type, gpu_mem); + memset(gpu_mem, 0, sizeof(*gpu_mem)); + } + + return 0; +} + +static void __acquire_psp_cmd_lock(struct ras_core_context *ras_core) +{ + struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx; + + if (psp_ctx->external_mutex) + mutex_lock(psp_ctx->external_mutex); + else + mutex_lock(&psp_ctx->internal_mutex); +} + +static void __release_psp_cmd_lock(struct ras_core_context *ras_core) +{ + struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx; + + if (psp_ctx->external_mutex) + mutex_unlock(psp_ctx->external_mutex); + else + mutex_unlock(&psp_ctx->internal_mutex); +} + +static uint32_t __get_ring_frame_slot(struct ras_core_context *ras_core) +{ + struct ras_psp *psp = &ras_core->ras_psp; + uint32_t ras_ring_wptr_dw; + + ras_ring_wptr_dw = psp->ip_func->psp_ras_ring_wptr_get(ras_core); + + return (ras_ring_wptr_dw << 2) / sizeof(struct psp_gfx_rb_frame); +} + +static int __set_ring_frame_slot(struct ras_core_context *ras_core, + uint32_t slot) +{ + struct ras_psp *psp = &ras_core->ras_psp; + + return psp->ip_func->psp_ras_ring_wptr_set(ras_core, + (slot * sizeof(struct psp_gfx_rb_frame)) >> 2); +} + +static int write_frame_to_ras_psp_ring(struct ras_core_context *ras_core, + struct psp_gfx_rb_frame *frame) +{ + struct gpu_mem_block *ring_mem; + struct psp_gfx_rb_frame *rb_frame; + uint32_t max_frame_slot; + uint32_t slot_idx; + uint32_t write_flush_read_back = 0; + int ret = 0; + + ring_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_RING); + if (!ring_mem) + return -ENOMEM; + + max_frame_slot = + ring_mem->mem_size / sizeof(struct psp_gfx_rb_frame); + + rb_frame = + (struct psp_gfx_rb_frame *)ring_mem->mem_cpu_addr; + + slot_idx = __get_ring_frame_slot(ras_core); + if (slot_idx >= max_frame_slot) + slot_idx = 0; + + memcpy(&rb_frame[slot_idx], frame, sizeof(*frame)); + + /* Do a read to force the write of the frame before writing + * write pointer. + */ + write_flush_read_back = rb_frame[slot_idx].fence_value; + if (write_flush_read_back != frame->fence_value) { + RAS_DEV_ERR(ras_core->dev, + "Failed to submit ring cmd! cmd:0x%x:0x%x, fence:0x%x:0x%x value:%u, expected:%u\n", + rb_frame[slot_idx].cmd_buf_addr_hi, + rb_frame[slot_idx].cmd_buf_addr_lo, + rb_frame[slot_idx].fence_addr_hi, + rb_frame[slot_idx].fence_addr_lo, + write_flush_read_back, frame->fence_value); + ret = -EACCES; + goto err; + } + + slot_idx++; + + if (slot_idx >= max_frame_slot) + slot_idx = 0; + + __set_ring_frame_slot(ras_core, slot_idx); + +err: + ras_psp_put_gpu_mem(ras_core, ring_mem); + return ret; +} + +static int send_psp_cmd(struct ras_core_context *ras_core, + enum psp_gfx_cmd_id gfx_cmd_id, void *cmd_data, + uint32_t cmd_size, struct psp_cmd_resp *resp) +{ + struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx; + struct gpu_mem_block *psp_cmd_buf = NULL; + struct gpu_mem_block *psp_fence_buf = NULL; + struct psp_gfx_cmd_resp *gfx_cmd; + struct psp_gfx_rb_frame rb_frame; + int ret = 0; + int timeout = 1000; + + if (!cmd_data || (cmd_size > sizeof(union psp_gfx_commands)) || !resp) { + RAS_DEV_ERR(ras_core->dev, "Invalid RAS PSP command, id: %u\n", gfx_cmd_id); + return -EINVAL; + } + + __acquire_psp_cmd_lock(ras_core); + + psp_cmd_buf = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_CMD); + if (!psp_cmd_buf) { + ret = -ENOMEM; + goto exit; + } + + psp_fence_buf = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_FENCE); + if (!psp_fence_buf) { + ret = -ENOMEM; + goto exit; + } + + gfx_cmd = (struct psp_gfx_cmd_resp *)psp_cmd_buf->mem_cpu_addr; + memset(gfx_cmd, 0, sizeof(*gfx_cmd)); + gfx_cmd->cmd_id = gfx_cmd_id; + memcpy(&gfx_cmd->cmd, cmd_data, cmd_size); + + psp_ctx->in_fence_value++; + + memset(&rb_frame, 0, sizeof(rb_frame)); + rb_frame.cmd_buf_addr_hi = upper_32_bits(psp_cmd_buf->mem_mc_addr); + rb_frame.cmd_buf_addr_lo = lower_32_bits(psp_cmd_buf->mem_mc_addr); + rb_frame.fence_addr_hi = upper_32_bits(psp_fence_buf->mem_mc_addr); + rb_frame.fence_addr_lo = lower_32_bits(psp_fence_buf->mem_mc_addr); + rb_frame.fence_value = psp_ctx->in_fence_value; + + ret = write_frame_to_ras_psp_ring(ras_core, &rb_frame); + if (ret) { + psp_ctx->in_fence_value--; + goto exit; + } + + while (*((uint64_t *)psp_fence_buf->mem_cpu_addr) != + psp_ctx->in_fence_value) { + if (--timeout == 0) + break; + /* + * Shouldn't wait for timeout when err_event_athub occurs, + * because gpu reset thread triggered and lock resource should + * be released for psp resume sequence. + */ + if (ras_core_ras_interrupt_detected(ras_core)) + break; + + msleep(2); + } + + resp->status = gfx_cmd->resp.status; + resp->session_id = gfx_cmd->resp.session_id; + +exit: + ras_psp_put_gpu_mem(ras_core, psp_cmd_buf); + ras_psp_put_gpu_mem(ras_core, psp_fence_buf); + + __release_psp_cmd_lock(ras_core); + + return ret; +} + +static void __check_ras_ta_cmd_resp(struct ras_core_context *ras_core, + struct ras_ta_cmd *ras_cmd) +{ + + if (ras_cmd->ras_out_message.flags.err_inject_switch_disable_flag) { + RAS_DEV_WARN(ras_core->dev, "ECC switch disabled\n"); + ras_cmd->ras_status = RAS_TA_STATUS__ERROR_RAS_NOT_AVAILABLE; + } else if (ras_cmd->ras_out_message.flags.reg_access_failure_flag) + RAS_DEV_WARN(ras_core->dev, "RAS internal register access blocked\n"); + + switch (ras_cmd->ras_status) { + case RAS_TA_STATUS__ERROR_UNSUPPORTED_IP: + RAS_DEV_WARN(ras_core->dev, + "RAS WARNING: cmd failed due to unsupported ip\n"); + break; + case RAS_TA_STATUS__ERROR_UNSUPPORTED_ERROR_INJ: + RAS_DEV_WARN(ras_core->dev, + "RAS WARNING: cmd failed due to unsupported error injection\n"); + break; + case RAS_TA_STATUS__SUCCESS: + break; + case RAS_TA_STATUS__TEE_ERROR_ACCESS_DENIED: + if (ras_cmd->cmd_id == RAS_TA_CMD_ID__TRIGGER_ERROR) + RAS_DEV_WARN(ras_core->dev, + "RAS WARNING: Inject error to critical region is not allowed\n"); + break; + default: + RAS_DEV_WARN(ras_core->dev, + "RAS WARNING: ras status = 0x%X\n", ras_cmd->ras_status); + break; + } +} + +static int send_ras_ta_runtime_cmd(struct ras_core_context *ras_core, + enum ras_ta_cmd_id cmd_id, void *in, uint32_t in_size, + void *out, uint32_t out_size) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + struct gpu_mem_block *cmd_mem; + struct ras_ta_cmd *ras_cmd; + struct psp_gfx_cmd_invoke_cmd invoke_cmd = {0}; + struct psp_cmd_resp resp = {0}; + int ret = 0; + + if (!in || (in_size > sizeof(union ras_ta_cmd_input)) || + (cmd_id >= MAX_RAS_TA_CMD_ID)) { + RAS_DEV_ERR(ras_core->dev, "Invalid RAS TA command, id: %u\n", cmd_id); + return -EINVAL; + } + + ras_psp_sync_system_ras_psp_status(ras_core); + + cmd_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_CMD); + if (!cmd_mem) + return -ENOMEM; + + if (!ras_core_down_trylock_gpu_reset_lock(ras_core)) { + ret = -EACCES; + goto out; + } + + ras_cmd = (struct ras_ta_cmd *)cmd_mem->mem_cpu_addr; + + mutex_lock(&ta_ctx->ta_mutex); + + memset(ras_cmd, 0, sizeof(*ras_cmd)); + ras_cmd->cmd_id = cmd_id; + memcpy(&ras_cmd->ras_in_message, in, in_size); + + invoke_cmd.ta_cmd_id = cmd_id; + invoke_cmd.session_id = ta_ctx->session_id; + + ret = send_psp_cmd(ras_core, GFX_CMD_ID_INVOKE_CMD, + &invoke_cmd, sizeof(invoke_cmd), &resp); + + /* If err_event_athub occurs error inject was successful, however + * return status from TA is no long reliable + */ + if (ras_core_ras_interrupt_detected(ras_core)) { + ret = 0; + goto unlock; + } + + if (ret || resp.status) { + RAS_DEV_ERR(ras_core->dev, + "RAS: Failed to send psp cmd! ret:%d, status:%u\n", + ret, resp.status); + ret = -ESTRPIPE; + goto unlock; + } + + if (ras_cmd->if_version > RAS_TA_HOST_IF_VER) { + RAS_DEV_WARN(ras_core->dev, "RAS: Unsupported Interface\n"); + ret = -EINVAL; + goto unlock; + } + + if (!ras_cmd->ras_status && out && out_size) + memcpy(out, &ras_cmd->ras_out_message, out_size); + + __check_ras_ta_cmd_resp(ras_core, ras_cmd); + +unlock: + mutex_unlock(&ta_ctx->ta_mutex); + ras_core_up_gpu_reset_lock(ras_core); +out: + ras_psp_put_gpu_mem(ras_core, cmd_mem); + return ret; +} + +static int trigger_ras_ta_error(struct ras_core_context *ras_core, + struct ras_ta_trigger_error_input *info, uint32_t instance_mask) +{ + uint32_t dev_mask = 0; + + switch (info->block_id) { + case RAS_TA_BLOCK__GFX: + if (ras_gfx_get_ta_subblock(ras_core, info->inject_error_type, + info->sub_block_index, &info->sub_block_index)) + return -EINVAL; + + dev_mask = RAS_GET_MASK(ras_core->dev, GC, instance_mask); + break; + case RAS_TA_BLOCK__SDMA: + dev_mask = RAS_GET_MASK(ras_core->dev, SDMA0, instance_mask); + break; + case RAS_TA_BLOCK__VCN: + case RAS_TA_BLOCK__JPEG: + dev_mask = RAS_GET_MASK(ras_core->dev, VCN, instance_mask); + break; + default: + dev_mask = instance_mask; + break; + } + + /* reuse sub_block_index for backward compatibility */ + dev_mask <<= RAS_TA_INST_SHIFT; + dev_mask &= RAS_TA_INST_MASK; + info->sub_block_index |= dev_mask; + + return send_ras_ta_runtime_cmd(ras_core, RAS_TA_CMD_ID__TRIGGER_ERROR, + info, sizeof(*info), NULL, 0); +} + +static int send_load_ta_fw_cmd(struct ras_core_context *ras_core, + struct ras_ta_ctx *ta_ctx) +{ + struct ras_ta_fw_bin *fw_bin = &ta_ctx->fw_bin; + struct gpu_mem_block *fw_mem; + struct gpu_mem_block *cmd_mem; + struct ras_ta_cmd *ta_cmd; + struct ras_ta_init_flags *ta_init_flags; + struct psp_gfx_cmd_load_ta psp_load_ta_cmd; + struct psp_cmd_resp resp = {0}; + struct ras_ta_image_header *fw_hdr = NULL; + int ret; + + fw_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_FW); + if (!fw_mem) + return -ENOMEM; + + cmd_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_CMD); + if (!cmd_mem) { + ret = -ENOMEM; + goto err; + } + + ret = ras_psp_get_ras_ta_init_param(ras_core, &ta_ctx->init_param); + if (ret) + goto err; + + if (!ras_core_down_trylock_gpu_reset_lock(ras_core)) { + ret = -EACCES; + goto err; + } + + /* copy ras ta binary to shared gpu memory */ + memcpy(fw_mem->mem_cpu_addr, fw_bin->bin_addr, fw_bin->bin_size); + fw_mem->mem_size = fw_bin->bin_size; + + /* Initialize ras ta startup parameter */ + ta_cmd = (struct ras_ta_cmd *)cmd_mem->mem_cpu_addr; + ta_init_flags = &ta_cmd->ras_in_message.init_flags; + + ta_init_flags->poison_mode_en = ta_ctx->init_param.poison_mode_en; + ta_init_flags->dgpu_mode = ta_ctx->init_param.dgpu_mode; + ta_init_flags->xcc_mask = ta_ctx->init_param.xcc_mask; + ta_init_flags->channel_dis_num = ta_ctx->init_param.channel_dis_num; + ta_init_flags->nps_mode = ta_ctx->init_param.nps_mode; + ta_init_flags->active_umc_mask = ta_ctx->init_param.active_umc_mask; + + /* Setup load ras ta command */ + memset(&psp_load_ta_cmd, 0, sizeof(psp_load_ta_cmd)); + psp_load_ta_cmd.app_phy_addr_lo = lower_32_bits(fw_mem->mem_mc_addr); + psp_load_ta_cmd.app_phy_addr_hi = upper_32_bits(fw_mem->mem_mc_addr); + psp_load_ta_cmd.app_len = fw_mem->mem_size; + psp_load_ta_cmd.cmd_buf_phy_addr_lo = lower_32_bits(cmd_mem->mem_mc_addr); + psp_load_ta_cmd.cmd_buf_phy_addr_hi = upper_32_bits(cmd_mem->mem_mc_addr); + psp_load_ta_cmd.cmd_buf_len = cmd_mem->mem_size; + + ret = send_psp_cmd(ras_core, GFX_CMD_ID_LOAD_TA, + &psp_load_ta_cmd, sizeof(psp_load_ta_cmd), &resp); + if (!ret && !resp.status) { + /* Read TA version at FW offset 0x60 if TA version not found*/ + fw_hdr = (struct ras_ta_image_header *)fw_bin->bin_addr; + RAS_DEV_INFO(ras_core->dev, "PSP: RAS TA(version:%X.%X.%X.%X) is loaded.\n", + (fw_hdr->image_version >> 24) & 0xFF, (fw_hdr->image_version >> 16) & 0xFF, + (fw_hdr->image_version >> 8) & 0xFF, fw_hdr->image_version & 0xFF); + ta_ctx->ta_version = fw_hdr->image_version; + ta_ctx->session_id = resp.session_id; + ta_ctx->ras_ta_initialized = true; + } else { + RAS_DEV_ERR(ras_core->dev, + "Failed to load RAS TA! ret:%d, status:%d\n", ret, resp.status); + } + + ras_core_up_gpu_reset_lock(ras_core); + +err: + ras_psp_put_gpu_mem(ras_core, fw_mem); + ras_psp_put_gpu_mem(ras_core, cmd_mem); + return ret; +} + +static int load_ras_ta_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_load *ras_ta_load) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + struct ras_ta_fw_bin *fw_bin = &ta_ctx->fw_bin; + int ret; + + fw_bin->bin_addr = ras_ta_load->bin_addr; + fw_bin->bin_size = ras_ta_load->bin_size; + fw_bin->fw_version = ras_ta_load->fw_version; + fw_bin->feature_version = ras_ta_load->feature_version; + + ret = send_load_ta_fw_cmd(ras_core, ta_ctx); + if (!ret) { + ras_ta_load->out_session_id = ta_ctx->session_id; + ras_ta_load->out_loaded_ta_version = ta_ctx->ta_version; + } + + return ret; +} + +static int unload_ras_ta_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_unload *ras_ta_unload) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + struct psp_gfx_cmd_unload_ta cmd_unload_ta = {0}; + struct psp_cmd_resp resp = {0}; + int ret; + + if (!ras_core_down_trylock_gpu_reset_lock(ras_core)) + return -EACCES; + + cmd_unload_ta.session_id = ta_ctx->session_id; + ret = send_psp_cmd(ras_core, GFX_CMD_ID_UNLOAD_TA, + &cmd_unload_ta, sizeof(cmd_unload_ta), &resp); + if (ret || resp.status) { + RAS_DEV_ERR(ras_core->dev, + "Failed to unload RAS TA! ret:%d, status:%u\n", + ret, resp.status); + goto unlock; + } + + kfree(ta_ctx->fw_bin.bin_addr); + memset(&ta_ctx->fw_bin, 0, sizeof(ta_ctx->fw_bin)); + ta_ctx->ta_version = 0; + ta_ctx->ras_ta_initialized = false; + ta_ctx->session_id = 0; + +unlock: + ras_core_up_gpu_reset_lock(ras_core); + + return ret; +} + +int ras_psp_load_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_load *ras_ta_load) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + struct ras_psp_ta_unload ras_ta_unload = {0}; + int ret; + + if (ta_ctx->preload_ras_ta_enabled) + return 0; + + if (!ras_ta_load) + return -EINVAL; + + if (ta_ctx->ras_ta_initialized) { + ras_ta_unload.ras_session_id = ta_ctx->session_id; + ret = unload_ras_ta_firmware(ras_core, &ras_ta_unload); + if (ret) + return ret; + } + + return load_ras_ta_firmware(ras_core, ras_ta_load); +} + +int ras_psp_unload_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_unload *ras_ta_unload) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + + if (ta_ctx->preload_ras_ta_enabled) + return 0; + + if ((!ras_ta_unload) || + (ras_ta_unload->ras_session_id != ta_ctx->session_id)) + return -EINVAL; + + return unload_ras_ta_firmware(ras_core, ras_ta_unload); +} + +int ras_psp_trigger_error(struct ras_core_context *ras_core, + struct ras_ta_trigger_error_input *info, uint32_t instance_mask) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + + if (!ta_ctx->preload_ras_ta_enabled && !ta_ctx->ras_ta_initialized) { + RAS_DEV_ERR(ras_core->dev, "RAS: ras firmware not initialized!"); + return -ENOEXEC; + } + + if (!info) + return -EINVAL; + + return trigger_ras_ta_error(ras_core, info, instance_mask); +} + +int ras_psp_query_address(struct ras_core_context *ras_core, + struct ras_ta_query_address_input *addr_in, + struct ras_ta_query_address_output *addr_out) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + + if (!ta_ctx->preload_ras_ta_enabled && + !ta_ctx->ras_ta_initialized) { + RAS_DEV_ERR(ras_core->dev, "RAS: ras firmware not initialized!"); + return -ENOEXEC; + } + + if (!addr_in || !addr_out) + return -EINVAL; + + return send_ras_ta_runtime_cmd(ras_core, RAS_TA_CMD_ID__QUERY_ADDRESS, + addr_in, sizeof(*addr_in), addr_out, sizeof(*addr_out)); +} + +int ras_psp_sw_init(struct ras_core_context *ras_core) +{ + struct ras_psp *psp = &ras_core->ras_psp; + + memset(psp, 0, sizeof(*psp)); + + psp->sys_func = ras_core->config->psp_cfg.psp_sys_fn; + if (!psp->sys_func) { + RAS_DEV_ERR(ras_core->dev, "RAS psp sys function not configured!\n"); + return -EINVAL; + } + + mutex_init(&psp->psp_ctx.internal_mutex); + mutex_init(&psp->ta_ctx.ta_mutex); + + return 0; +} + +int ras_psp_sw_fini(struct ras_core_context *ras_core) +{ + struct ras_psp *psp = &ras_core->ras_psp; + + mutex_destroy(&psp->psp_ctx.internal_mutex); + mutex_destroy(&psp->ta_ctx.ta_mutex); + + memset(psp, 0, sizeof(*psp)); + + return 0; +} + +int ras_psp_hw_init(struct ras_core_context *ras_core) +{ + struct ras_psp *psp = &ras_core->ras_psp; + + psp->psp_ip_version = ras_core->config->psp_ip_version; + + psp->ip_func = ras_psp_get_ip_funcs(ras_core, psp->psp_ip_version); + if (!psp->ip_func) + return -EINVAL; + + /* After GPU reset, the system RAS PSP status may change. + * therefore, it is necessary to synchronize the system status again. + */ + ras_psp_sync_system_ras_psp_status(ras_core); + + return 0; +} + +int ras_psp_hw_fini(struct ras_core_context *ras_core) +{ + return 0; +} + +bool ras_psp_check_supported_cmd(struct ras_core_context *ras_core, + enum ras_ta_cmd_id cmd_id) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + bool ret = false; + + if (!ta_ctx->preload_ras_ta_enabled && !ta_ctx->ras_ta_initialized) + return false; + + switch (cmd_id) { + case RAS_TA_CMD_ID__QUERY_ADDRESS: + /* Currently, querying the address from RAS TA is only supported + * when the RAS TA firmware is loaded during driver installation. + */ + if (ta_ctx->preload_ras_ta_enabled) + ret = true; + break; + case RAS_TA_CMD_ID__TRIGGER_ERROR: + ret = true; + break; + default: + ret = false; + break; + } + + return ret; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_psp.h b/drivers/gpu/drm/amd/ras/rascore/ras_psp.h new file mode 100644 index 000000000000..71776fecfd66 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_psp.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __RAS_PSP_H__ +#define __RAS_PSP_H__ +#include "ras.h" +#include "ras_ta_if.h" + +struct ras_core_context; +struct ras_ta_trigger_error_input; +struct ras_ta_query_address_input; +struct ras_ta_query_address_output; +enum ras_ta_cmd_id; + +struct ras_ta_image_header { + uint32_t reserved1[24]; + uint32_t image_version; /* [0x60] Off Chip Firmware Version */ + uint32_t reserved2[39]; +}; + +struct ras_psp_sys_status { + bool initialized; + uint32_t session_id; + void *psp_cmd_mutex; +}; + +struct ras_ta_init_param { + uint8_t poison_mode_en; + uint8_t dgpu_mode; + uint16_t xcc_mask; + uint8_t channel_dis_num; + uint8_t nps_mode; + uint32_t active_umc_mask; +}; + +struct gpu_mem_block { + uint32_t mem_type; + void *mem_bo; + uint64_t mem_mc_addr; + void *mem_cpu_addr; + uint32_t mem_size; + int ref_count; + void *private; +}; + +struct ras_psp_ip_func { + uint32_t (*psp_ras_ring_wptr_get)(struct ras_core_context *ras_core); + int (*psp_ras_ring_wptr_set)(struct ras_core_context *ras_core, uint32_t wptr); +}; + +struct ras_psp_ring { + struct gpu_mem_block ras_ring_gpu_mem; +}; + +struct psp_cmd_resp { + uint32_t status; + uint32_t session_id; +}; + +struct ras_psp_ctx { + void *external_mutex; + struct mutex internal_mutex; + uint64_t in_fence_value; + struct gpu_mem_block psp_cmd_gpu_mem; + struct gpu_mem_block out_fence_gpu_mem; +}; + +struct ras_ta_fw_bin { + uint32_t fw_version; + uint32_t feature_version; + uint32_t bin_size; + uint8_t *bin_addr; +}; + +struct ras_ta_ctx { + bool preload_ras_ta_enabled; + bool ras_ta_initialized; + uint32_t session_id; + uint32_t resp_status; + uint32_t ta_version; + struct mutex ta_mutex; + struct ras_ta_fw_bin fw_bin; + struct ras_ta_init_param init_param; + struct gpu_mem_block fw_gpu_mem; + struct gpu_mem_block cmd_gpu_mem; +}; + +struct ras_psp { + uint32_t psp_ip_version; + struct ras_psp_ring psp_ring; + struct ras_psp_ctx psp_ctx; + struct ras_ta_ctx ta_ctx; + const struct ras_psp_ip_func *ip_func; + const struct ras_psp_sys_func *sys_func; +}; + +struct ras_psp_ta_load { + uint32_t fw_version; + uint32_t feature_version; + uint32_t bin_size; + uint8_t *bin_addr; + uint64_t out_session_id; + uint32_t out_loaded_ta_version; +}; + +struct ras_psp_ta_unload { + uint64_t ras_session_id; +}; + +int ras_psp_sw_init(struct ras_core_context *ras_core); +int ras_psp_sw_fini(struct ras_core_context *ras_core); +int ras_psp_hw_init(struct ras_core_context *ras_core); +int ras_psp_hw_fini(struct ras_core_context *ras_core); +int ras_psp_load_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_load *ras_ta_load); +int ras_psp_unload_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_unload *ras_ta_unload); +int ras_psp_trigger_error(struct ras_core_context *ras_core, + struct ras_ta_trigger_error_input *info, uint32_t instance_mask); +int ras_psp_query_address(struct ras_core_context *ras_core, + struct ras_ta_query_address_input *addr_in, + struct ras_ta_query_address_output *addr_out); +bool ras_psp_check_supported_cmd(struct ras_core_context *ras_core, + enum ras_ta_cmd_id cmd_id); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h b/drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h new file mode 100644 index 000000000000..0921e36d3274 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef _RAS_TA_IF_H +#define _RAS_TA_IF_H +#include "ras.h" + +#define RAS_TA_HOST_IF_VER 0 + +/* Responses have bit 31 set */ +#define RSP_ID_MASK (1U << 31) +#define RSP_ID(cmdId) (((uint32_t)(cmdId)) | RSP_ID_MASK) + +/* invalid node instance value */ +#define RAS_TA_INV_NODE 0xffff + +/* RAS related enumerations */ +/**********************************************************/ +enum ras_ta_cmd_id { + RAS_TA_CMD_ID__ENABLE_FEATURES = 0, + RAS_TA_CMD_ID__DISABLE_FEATURES, + RAS_TA_CMD_ID__TRIGGER_ERROR, + RAS_TA_CMD_ID__QUERY_BLOCK_INFO, + RAS_TA_CMD_ID__QUERY_SUB_BLOCK_INFO, + RAS_TA_CMD_ID__QUERY_ADDRESS, + MAX_RAS_TA_CMD_ID +}; + +enum ras_ta_status { + RAS_TA_STATUS__SUCCESS = 0x0000, + RAS_TA_STATUS__RESET_NEEDED = 0xA001, + RAS_TA_STATUS__ERROR_INVALID_PARAMETER = 0xA002, + RAS_TA_STATUS__ERROR_RAS_NOT_AVAILABLE = 0xA003, + RAS_TA_STATUS__ERROR_RAS_DUPLICATE_CMD = 0xA004, + RAS_TA_STATUS__ERROR_INJECTION_FAILED = 0xA005, + RAS_TA_STATUS__ERROR_ASD_READ_WRITE = 0xA006, + RAS_TA_STATUS__ERROR_TOGGLE_DF_CSTATE = 0xA007, + RAS_TA_STATUS__ERROR_TIMEOUT = 0xA008, + RAS_TA_STATUS__ERROR_BLOCK_DISABLED = 0XA009, + RAS_TA_STATUS__ERROR_GENERIC = 0xA00A, + RAS_TA_STATUS__ERROR_RAS_MMHUB_INIT = 0xA00B, + RAS_TA_STATUS__ERROR_GET_DEV_INFO = 0xA00C, + RAS_TA_STATUS__ERROR_UNSUPPORTED_DEV = 0xA00D, + RAS_TA_STATUS__ERROR_NOT_INITIALIZED = 0xA00E, + RAS_TA_STATUS__ERROR_TEE_INTERNAL = 0xA00F, + RAS_TA_STATUS__ERROR_UNSUPPORTED_FUNCTION = 0xA010, + RAS_TA_STATUS__ERROR_SYS_DRV_REG_ACCESS = 0xA011, + RAS_TA_STATUS__ERROR_RAS_READ_WRITE = 0xA012, + RAS_TA_STATUS__ERROR_NULL_PTR = 0xA013, + RAS_TA_STATUS__ERROR_UNSUPPORTED_IP = 0xA014, + RAS_TA_STATUS__ERROR_PCS_STATE_QUIET = 0xA015, + RAS_TA_STATUS__ERROR_PCS_STATE_ERROR = 0xA016, + RAS_TA_STATUS__ERROR_PCS_STATE_HANG = 0xA017, + RAS_TA_STATUS__ERROR_PCS_STATE_UNKNOWN = 0xA018, + RAS_TA_STATUS__ERROR_UNSUPPORTED_ERROR_INJ = 0xA019, + RAS_TA_STATUS__TEE_ERROR_ACCESS_DENIED = 0xA01A +}; + +enum ras_ta_block { + RAS_TA_BLOCK__UMC = 0, + RAS_TA_BLOCK__SDMA, + RAS_TA_BLOCK__GFX, + RAS_TA_BLOCK__MMHUB, + RAS_TA_BLOCK__ATHUB, + RAS_TA_BLOCK__PCIE_BIF, + RAS_TA_BLOCK__HDP, + RAS_TA_BLOCK__XGMI_WAFL, + RAS_TA_BLOCK__DF, + RAS_TA_BLOCK__SMN, + RAS_TA_BLOCK__SEM, + RAS_TA_BLOCK__MP0, + RAS_TA_BLOCK__MP1, + RAS_TA_BLOCK__FUSE, + RAS_TA_BLOCK__MCA, + RAS_TA_BLOCK__VCN, + RAS_TA_BLOCK__JPEG, + RAS_TA_BLOCK__IH, + RAS_TA_BLOCK__MPIO, + RAS_TA_BLOCK__MMSCH, + RAS_TA_NUM_BLOCK_MAX +}; + +enum ras_ta_mca_block { + RAS_TA_MCA_BLOCK__MP0 = 0, + RAS_TA_MCA_BLOCK__MP1 = 1, + RAS_TA_MCA_BLOCK__MPIO = 2, + RAS_TA_MCA_BLOCK__IOHC = 3, + RAS_TA_MCA_NUM_BLOCK_MAX +}; + +enum ras_ta_error_type { + RAS_TA_ERROR__NONE = 0, + RAS_TA_ERROR__PARITY = 1, + RAS_TA_ERROR__SINGLE_CORRECTABLE = 2, + RAS_TA_ERROR__MULTI_UNCORRECTABLE = 4, + RAS_TA_ERROR__POISON = 8, +}; + +enum ras_ta_address_type { + RAS_TA_MCA_TO_PA, + RAS_TA_PA_TO_MCA, +}; + +enum ras_ta_nps_mode { + RAS_TA_UNKNOWN_MODE = 0, + RAS_TA_NPS1_MODE = 1, + RAS_TA_NPS2_MODE = 2, + RAS_TA_NPS4_MODE = 4, + RAS_TA_NPS8_MODE = 8, +}; + +/* Input/output structures for RAS commands */ +/**********************************************************/ + +struct ras_ta_enable_features_input { + enum ras_ta_block block_id; + enum ras_ta_error_type error_type; +}; + +struct ras_ta_disable_features_input { + enum ras_ta_block block_id; + enum ras_ta_error_type error_type; +}; + +struct ras_ta_trigger_error_input { + /* ras-block. i.e. umc, gfx */ + enum ras_ta_block block_id; + + /* type of error. i.e. single_correctable */ + enum ras_ta_error_type inject_error_type; + + /* mem block. i.e. hbm, sram etc. */ + uint32_t sub_block_index; + + /* explicit address of error */ + uint64_t address; + + /* method if error injection. i.e persistent, coherent etc. */ + uint64_t value; +}; + +struct ras_ta_init_flags { + uint8_t poison_mode_en; + uint8_t dgpu_mode; + uint16_t xcc_mask; + uint8_t channel_dis_num; + uint8_t nps_mode; + uint32_t active_umc_mask; +}; + +struct ras_ta_mca_addr { + uint64_t err_addr; + uint32_t ch_inst; + uint32_t umc_inst; + uint32_t node_inst; + uint32_t socket_id; +}; + +struct ras_ta_phy_addr { + uint64_t pa; + uint32_t bank; + uint32_t channel_idx; +}; + +struct ras_ta_query_address_input { + enum ras_ta_address_type addr_type; + struct ras_ta_mca_addr ma; + struct ras_ta_phy_addr pa; +}; + +struct ras_ta_output_flags { + uint8_t ras_init_success_flag; + uint8_t err_inject_switch_disable_flag; + uint8_t reg_access_failure_flag; +}; + +struct ras_ta_query_address_output { + /* don't use the flags here */ + struct ras_ta_output_flags flags; + struct ras_ta_mca_addr ma; + struct ras_ta_phy_addr pa; +}; + +/* Common input structure for RAS callbacks */ +/**********************************************************/ +union ras_ta_cmd_input { + struct ras_ta_init_flags init_flags; + struct ras_ta_enable_features_input enable_features; + struct ras_ta_disable_features_input disable_features; + struct ras_ta_trigger_error_input trigger_error; + struct ras_ta_query_address_input address; + uint32_t reserve_pad[256]; +}; + +union ras_ta_cmd_output { + struct ras_ta_output_flags flags; + struct ras_ta_query_address_output address; + uint32_t reserve_pad[256]; +}; + +struct ras_ta_cmd { + uint32_t cmd_id; + uint32_t resp_id; + uint32_t ras_status; + uint32_t if_version; + union ras_ta_cmd_input ras_in_message; + union ras_ta_cmd_output ras_out_message; +}; + +#endif -- 2.34.1