Add amdgpu ras system functions.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
Reviewed-by: Tao Zhou <tao.zh...@amd.com>
---
 .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c  | 393 ++++++++++++++++++
 drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h     | 131 +++++++
 2 files changed, 524 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
 create mode 100644 drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h

diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
new file mode 100644
index 000000000000..40071b876333
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras_sys.h"
+#include "amdgpu_ras_mgr.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
+
+/**
+ * amdgpu_ras_sys_detect_fatal_event - report a fatal error and start handling it
+ * @ras_core: RAS core context; ras_core->dev is used as the amdgpu device
+ * @data: opaque payload handed to amdgpu_ras_process_handle_unexpected_interrupt()
+ *
+ * Generates a sequence number of type RAS_SEQNO_TYPE_UE, logs the event with
+ * that number and then passes @data to the unexpected-interrupt handler.
+ *
+ * Return: the value returned by amdgpu_ras_process_handle_unexpected_interrupt().
+ */
+static int amdgpu_ras_sys_detect_fatal_event(struct ras_core_context *ras_core, void *data)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+	uint64_t seq_no;
+
+	seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_UE);
+	RAS_DEV_INFO(adev,
+		"{%llu} Uncorrectable hardware error(ERREVENT_ATHUB_INTERRUPT) detected!\n",
+		seq_no);
+
+	return amdgpu_ras_process_handle_unexpected_interrupt(adev, data);
+}
+
+/**
+ * amdgpu_ras_sys_poison_consumption_event - invoke the poison consumption callback
+ * @ras_core: RAS core context; ras_core->dev is used as the amdgpu device
+ * @data: pointer to the struct ras_event_req describing the event
+ *
+ * When the request carries a pasid_fn callback it is called with the device,
+ * the request's pasid and the request's data.
+ *
+ * Return: 0 on success, -EINVAL when @data is NULL.
+ */
+static int amdgpu_ras_sys_poison_consumption_event(struct ras_core_context *ras_core,
+				void *data)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+	struct ras_event_req *req = data;
+	pasid_notify pasid_fn;
+
+	if (!req)
+		return -EINVAL;
+
+	if (req->pasid_fn) {
+		pasid_fn = (pasid_notify)req->pasid_fn;
+		pasid_fn(adev, req->pasid, req->data);
+	}
+
+	return 0;
+}
+
+/**
+ * amdgpu_ras_sys_gen_seqno - produce a sequence number for a RAS event
+ * @ras_core: RAS core context; ras_core->dev is used as the amdgpu device
+ * @seqno_type: kind of sequence number requested
+ * @seqno: where the sequence number is stored
+ *
+ * The hive's event manager is used when amdgpu_get_xgmi_hive() returns a hive,
+ * otherwise the event manager of the device's RAS manager is used.  A fatal
+ * event seen while amdgpu_ras_in_recovery() is true reuses the last recorded
+ * fatal sequence number; every other request takes the next number and bumps
+ * the per-event-type counter.
+ *
+ * Return: 0 on success, -EINVAL when the RAS manager or @seqno is missing or
+ * @seqno_type is out of range.
+ */
+static int amdgpu_ras_sys_gen_seqno(struct ras_core_context *ras_core,
+			enum ras_seqno_type seqno_type, uint64_t *seqno)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+	struct ras_event_manager *event_mgr;
+	struct ras_event_state *event_state;
+	struct amdgpu_hive_info *hive;
+	enum ras_event_type event_type;
+	uint64_t seq_no;
+
+	if (!ras_mgr || !seqno ||
+		(seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
+		return -EINVAL;
+
+	switch (seqno_type) {
+	case RAS_SEQNO_TYPE_UE:
+		event_type = RAS_EVENT_TYPE_FATAL;
+		break;
+	case RAS_SEQNO_TYPE_CE:
+	case RAS_SEQNO_TYPE_DE:
+		event_type = RAS_EVENT_TYPE_POISON_CREATION;
+		break;
+	case RAS_SEQNO_TYPE_POISON_CONSUMPTION:
+		event_type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
+		break;
+	default:
+		event_type = RAS_EVENT_TYPE_INVALID;
+		break;
+	}
+
+	hive = amdgpu_get_xgmi_hive(adev);
+	event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;
+	event_state = &event_mgr->event_state[event_type];
+	if ((event_type == RAS_EVENT_TYPE_FATAL) && amdgpu_ras_in_recovery(adev)) {
+		seq_no = event_state->last_seqno;
+	} else {
+		seq_no = atomic64_inc_return(&event_mgr->seqno);
+		event_state->last_seqno = seq_no;
+		atomic64_inc(&event_state->count);
+	}
+	amdgpu_put_xgmi_hive(hive);
+
+	*seqno = seq_no;
+	return 0;
+}
+
+/* Tell whether the handler of @event_id reads its argument through the payload. */
+static bool amdgpu_ras_sys_event_needs_data(enum ras_notify_event event_id)
+{
+	switch (event_id) {
+	case RAS_EVENT_ID__RESERVE_BAD_PAGE:
+	case RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM:
+	case RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP:
+	case RAS_EVENT_ID__RESET_GPU:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/**
+ * amdgpu_ras_sys_event_notifier - dispatch a notification from the RAS core
+ * @ras_core: RAS core context; ras_core->dev is used as the amdgpu device
+ * @event_id: notification to handle
+ * @data: event payload; how it is interpreted depends on @event_id
+ *
+ * Return: the result of the selected handler, -EINVAL when the RAS manager or
+ * a payload that must be dereferenced is missing.  An unknown @event_id is
+ * logged and reported as 0.
+ */
+static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
+			enum ras_notify_event event_id, void *data)
+{
+	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
+	int ret = 0;
+
+	/* Reject a missing payload up front instead of dereferencing NULL below. */
+	if (!data && amdgpu_ras_sys_event_needs_data(event_id))
+		return -EINVAL;
+
+	switch (event_id) {
+	case RAS_EVENT_ID__BAD_PAGE_DETECTED:
+		/* The manager lookup is also NULL-checked in amdgpu_ras_sys_gen_seqno(). */
+		if (!ras_mgr)
+			return -EINVAL;
+		schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
+		break;
+	case RAS_EVENT_ID__POISON_CONSUMPTION:
+		ret = amdgpu_ras_sys_poison_consumption_event(ras_core, data);
+		break;
+	case RAS_EVENT_ID__RESERVE_BAD_PAGE:
+		ret = amdgpu_ras_reserve_page(ras_core->dev, *(uint64_t *)data);
+		break;
+	case RAS_EVENT_ID__FATAL_ERROR_DETECTED:
+		ret = amdgpu_ras_sys_detect_fatal_event(ras_core, data);
+		break;
+	case RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM:
+		ret = amdgpu_dpm_send_hbm_bad_pages_num(ras_core->dev, *(uint32_t *)data);
+		break;
+	case RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP:
+		ret = amdgpu_dpm_send_hbm_bad_channel_flag(ras_core->dev, *(uint32_t *)data);
+		break;
+	case RAS_EVENT_ID__DEVICE_RMA:
+		ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL);
+		ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
+		break;
+	case RAS_EVENT_ID__RESET_GPU:
+		ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
+		break;
+	default:
+		RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id);
+		break;
+	}
+
+	return ret;
+}
+
+/* Return ktime_get_real_seconds(); @ras_core is not used. */
+static u64 amdgpu_ras_sys_get_utc_second_timestamp(struct ras_core_context *ras_core)
+{
+	return ktime_get_real_seconds();
+}
+
+/**
+ * amdgpu_ras_sys_check_gpu_status - report reset and virtualization state
+ * @ras_core: RAS core context; ras_core->dev is used as the amdgpu device
+ * @status: receives a mask of RAS_GPU_STATUS__* flags
+ *
+ * RAS_GPU_STATUS__IN_RESET is set when amdgpu_in_reset() or
+ * amdgpu_ras_in_recovery() is true, RAS_GPU_STATUS__IS_VF when
+ * amdgpu_sriov_vf() is true.
+ *
+ * Return: 0 on success, -EINVAL when @status is NULL.
+ */
+static int amdgpu_ras_sys_check_gpu_status(struct ras_core_context *ras_core,
+			uint32_t *status)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+	uint32_t gpu_status = 0;
+
+	if (!status)
+		return -EINVAL;
+
+	if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
+		gpu_status |= RAS_GPU_STATUS__IN_RESET;
+
+	if (amdgpu_sriov_vf(adev))
+		gpu_status |= RAS_GPU_STATUS__IS_VF;
+
+	*status = gpu_status;
+
+	return 0;
+}
+
+/**
+ * amdgpu_ras_sys_get_device_system_info - fill in PCI and socket identifiers
+ * @ras_core: RAS core context; ras_core->dev is used as the amdgpu device
+ * @dev_info: structure to fill
+ *
+ * The socket id is set to 0 when the device provides no get_socket_id()
+ * callback.
+ *
+ * Return: 0 on success, -EINVAL when @dev_info is NULL.
+ */
+static int amdgpu_ras_sys_get_device_system_info(struct ras_core_context *ras_core,
+			struct device_system_info *dev_info)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+
+	if (!dev_info)
+		return -EINVAL;
+
+	dev_info->device_id = adev->pdev->device;
+	dev_info->vendor_id = adev->pdev->vendor;
+	dev_info->socket_id = 0;
+	/* NOTE(review): assumes smuio.funcs/get_socket_id may be unset - confirm. */
+	if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
+		dev_info->socket_id = adev->smuio.funcs->get_socket_id(adev);
+
+	return 0;
+}
+
+/**
+ * amdgpu_ras_sys_gpu_reset_lock - take or release the reset domain semaphore for reading
+ * @ras_core: RAS core context; ras_core->dev is used as the amdgpu device
+ * @down: true to acquire, false to release
+ * @try: with @down, use down_read_trylock() instead of down_read()
+ *
+ * Return: the result of down_read_trylock() for a try-lock request, 0 for a
+ * blocking acquire or a release.
+ */
+static int amdgpu_ras_sys_gpu_reset_lock(struct ras_core_context *ras_core,
+			bool down, bool try)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+	int ret = 0;
+
+	if (down && try)
+		ret = down_read_trylock(&adev->reset_domain->sem);
+	else if (down)
+		down_read(&adev->reset_domain->sem);
+	else
+		up_read(&adev->reset_domain->sem);
+
+	return ret;
+}
+
+/* Report whether the amdgpu_ras_in_intr flag is currently non-zero. */
+static bool amdgpu_ras_sys_detect_ras_interrupt(struct ras_core_context *ras_core)
+{
+	return !!atomic_read(&amdgpu_ras_in_intr);
+}
+
+/**
+ * amdgpu_ras_sys_get_gpu_mem - describe one of the PSP buffers used by RAS
+ * @ras_core: RAS core context; ras_core->dev is used as the amdgpu device
+ * @mem_type: which buffer to describe
+ * @gpu_mem: receives the buffer object, size, MC address and CPU address
+ *
+ * Return: 0 on success, -EINVAL for a NULL @gpu_mem or an unknown @mem_type,
+ * -ENOMEM when any field of the selected buffer is unset.
+ */
+static int amdgpu_ras_sys_get_gpu_mem(struct ras_core_context *ras_core,
+			enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+	struct psp_context *psp = &adev->psp;
+	struct psp_ring *psp_ring;
+	struct ta_mem_context *mem_ctx;
+
+	if (!gpu_mem)
+		return -EINVAL;
+
+	switch (mem_type) {
+	case GPU_MEM_TYPE_RAS_PSP_RING:
+		psp_ring = &psp->km_ring;
+		gpu_mem->mem_bo = adev->firmware.rbuf;
+		gpu_mem->mem_size = psp_ring->ring_size;
+		gpu_mem->mem_mc_addr = psp_ring->ring_mem_mc_addr;
+		gpu_mem->mem_cpu_addr = psp_ring->ring_mem;
+		break;
+	case GPU_MEM_TYPE_RAS_PSP_CMD:
+		gpu_mem->mem_bo = psp->cmd_buf_bo;
+		gpu_mem->mem_size = PSP_CMD_BUFFER_SIZE;
+		gpu_mem->mem_mc_addr = psp->cmd_buf_mc_addr;
+		gpu_mem->mem_cpu_addr = psp->cmd_buf_mem;
+		break;
+	case GPU_MEM_TYPE_RAS_PSP_FENCE:
+		gpu_mem->mem_bo = psp->fence_buf_bo;
+		gpu_mem->mem_size = PSP_FENCE_BUFFER_SIZE;
+		gpu_mem->mem_mc_addr = psp->fence_buf_mc_addr;
+		gpu_mem->mem_cpu_addr = psp->fence_buf;
+		break;
+	case GPU_MEM_TYPE_RAS_TA_FW:
+		gpu_mem->mem_bo = psp->fw_pri_bo;
+		gpu_mem->mem_size = PSP_1_MEG;
+		gpu_mem->mem_mc_addr = psp->fw_pri_mc_addr;
+		gpu_mem->mem_cpu_addr = psp->fw_pri_buf;
+		break;
+	case GPU_MEM_TYPE_RAS_TA_CMD:
+		mem_ctx = &psp->ras_context.context.mem_context;
+		gpu_mem->mem_bo = mem_ctx->shared_bo;
+		gpu_mem->mem_size = mem_ctx->shared_mem_size;
+		gpu_mem->mem_mc_addr = mem_ctx->shared_mc_addr;
+		gpu_mem->mem_cpu_addr = mem_ctx->shared_buf;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!gpu_mem->mem_bo || !gpu_mem->mem_size ||
+		!gpu_mem->mem_mc_addr || !gpu_mem->mem_cpu_addr) {
+		RAS_DEV_ERR(ras_core->dev, "The ras psp gpu memory is invalid!\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Nothing to release: amdgpu_ras_sys_get_gpu_mem() only reports existing buffers. */
+static int amdgpu_ras_sys_put_gpu_mem(struct ras_core_context *ras_core,
+			enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
+{
+	return 0;
+}
+
+/* Callback table of the amdgpu implementations defined in this file. */
+const struct ras_sys_func amdgpu_ras_sys_fn = {
+	.ras_notifier = amdgpu_ras_sys_event_notifier,
+	.get_utc_second_timestamp = amdgpu_ras_sys_get_utc_second_timestamp,
+	.gen_seqno = amdgpu_ras_sys_gen_seqno,
+	.check_gpu_status = amdgpu_ras_sys_check_gpu_status,
+	.get_device_system_info = amdgpu_ras_sys_get_device_system_info,
+	.gpu_reset_lock = amdgpu_ras_sys_gpu_reset_lock,
+	.detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt,
+	.get_gpu_mem = amdgpu_ras_sys_get_gpu_mem,
+	.put_gpu_mem = amdgpu_ras_sys_put_gpu_mem,
+};
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
new file mode 100644
index 000000000000..c48ff26525d6
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_SYS_H__
+#define __RAS_SYS_H__
+#include <linux/stdarg.h>
+#include <linux/printk.h>
+#include <linux/dev_printk.h>
+#include <linux/radix-tree.h>
+#include <linux/wait.h>
+#include "amdgpu.h"
+
+/*
+ * Logging helpers.  @device is an amdgpu device passed as an untyped pointer;
+ * when it is NULL the message goes through plain printk() instead.
+ */
+#define RAS_DEV_ERR(device, fmt, ...) \
+	do { \
+		if (device) \
+			dev_err(((struct amdgpu_device *)(device))->dev, fmt, ##__VA_ARGS__); \
+		else \
+			printk(KERN_ERR fmt, ##__VA_ARGS__); \
+	} while (0)
+
+#define RAS_DEV_WARN(device, fmt, ...) \
+	do { \
+		if (device) \
+			dev_warn(((struct amdgpu_device *)(device))->dev, fmt, ##__VA_ARGS__); \
+		else \
+			printk(KERN_WARNING fmt, ##__VA_ARGS__); \
+	} while (0)
+
+#define RAS_DEV_INFO(device, fmt, ...) \
+	do { \
+		if (device) \
+			dev_info(((struct amdgpu_device *)(device))->dev, fmt, ##__VA_ARGS__); \
+		else \
+			printk(KERN_INFO fmt, ##__VA_ARGS__); \
+	} while (0)
+
+#define RAS_DEV_DBG(device, fmt, ...) \
+	do { \
+		if (device) \
+			dev_dbg(((struct amdgpu_device *)(device))->dev, fmt, ##__VA_ARGS__); \
+		else \
+			printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+	} while (0)
+
+#define RAS_INFO(fmt, ...) printk(KERN_INFO fmt, ##__VA_ARGS__)
+
+/*
+ * Register accessors.  The local must be named adev: NOTE(review) the
+ * __RREG32_SOC15_RLC__/__WREG32_SOC15_RLC__ helpers appear to rely on a
+ * variable of that name being in scope - confirm before renaming.  @dev is
+ * captured in a separate temporary first so that a caller passing its own
+ * variable called adev does not initialize the local from itself.
+ */
+#define RAS_DEV_RREG32_SOC15(dev, ip, inst, reg) \
+({ \
+	struct amdgpu_device *_ras_dev = (struct amdgpu_device *)(dev); \
+	struct amdgpu_device *adev = _ras_dev; \
+	__RREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + (reg), \
+			 0, ip##_HWIP, inst); \
+})
+
+#define RAS_DEV_WREG32_SOC15(dev, ip, inst, reg, value) \
+({ \
+	struct amdgpu_device *_ras_dev = (struct amdgpu_device *)(dev); \
+	struct amdgpu_device *adev = _ras_dev; \
+	__WREG32_SOC15_RLC__((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + (reg)), \
+			 value, 0, ip##_HWIP, inst); \
+})
+
+/* GET_INST returns the physical instance corresponding to a logical instance */
+#define RAS_GET_INST(dev, ip, inst) \
+({ \
+	struct amdgpu_device *_ras_adev = (struct amdgpu_device *)(dev); \
+	_ras_adev->ip_map.logical_to_dev_inst ? \
+		_ras_adev->ip_map.logical_to_dev_inst(_ras_adev, ip##_HWIP, inst) : (inst); \
+})
+
+/* GET_MASK converts a logical instance mask through logical_to_dev_mask(), if set */
+#define RAS_GET_MASK(dev, ip, mask) \
+({ \
+	struct amdgpu_device *_ras_adev = (struct amdgpu_device *)(dev); \
+	(_ras_adev->ip_map.logical_to_dev_mask ? \
+		_ras_adev->ip_map.logical_to_dev_mask(_ras_adev, ip##_HWIP, mask) : (mask)); \
+})
+
+/* Delete the entry at the index the radix tree iterator @iter currently points to. */
+static inline void *ras_radix_tree_delete_iter(struct radix_tree_root *root, void *iter)
+{
+	return radix_tree_delete(root, ((struct radix_tree_iter *)iter)->index);
+}
+
+/*
+ * Sleep on the wait queue head @wq_head until @condition(@param) is non-zero,
+ * @timeout expires or a signal arrives; returns what
+ * wait_event_interruptible_timeout() returns.
+ */
+static inline long ras_wait_event_interruptible_timeout(void *wq_head,
+			int (*condition)(void *param), void *param, unsigned int timeout)
+{
+	return wait_event_interruptible_timeout(*(wait_queue_head_t *)wq_head,
+			condition(param), timeout);
+}
+
+extern const struct ras_sys_func amdgpu_ras_sys_fn;
+
+#endif
-- 
2.34.1