On Wed, Sep 17, 2025 at 9:37 PM YiPeng Chai <yipeng.c...@amd.com> wrote:
>
> Add the amdgpu RAS system callbacks, which bridge the RAS core context
> to amdgpu device services: event notification, sequence number
> generation, GPU status checks, reset-domain locking, and PSP GPU
> memory access.
>
> Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
> Reviewed-by: Tao Zhou <tao.zh...@amd.com>
> ---
>  .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c  | 266 ++++++++++++++++++
>  drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h     | 109 +++++++
>  2 files changed, 375 insertions(+)
>  create mode 100644 drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
>  create mode 100644 drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
>
> diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> new file mode 100644
> index 000000000000..40071b876333
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> @@ -0,0 +1,266 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright 2025 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +#include "ras_sys.h"
> +#include "amdgpu_ras_mgr.h"
> +#include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
> +
> +static int amdgpu_ras_sys_detect_fatal_event(struct ras_core_context *ras_core, void *data)
> +{
> +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> +       uint64_t seq_no;
> +
> +       seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_UE);
> +       RAS_DEV_INFO(adev,
> +               "{%llu} Uncorrectable hardware error(ERREVENT_ATHUB_INTERRUPT) detected!\n",
> +               seq_no);
> +
> +       return amdgpu_ras_process_handle_unexpected_interrupt(adev, data);
> +}
> +
> +static int amdgpu_ras_sys_poison_consumption_event(struct ras_core_context *ras_core,
> +                               void *data)
> +{
> +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> +       struct ras_event_req *req = (struct ras_event_req *)data;
> +       pasid_notify pasid_fn;
> +
> +       if (!req)
> +               return -EINVAL;
> +
> +       if (req->pasid_fn) {
> +               pasid_fn = (pasid_notify)req->pasid_fn;
> +               pasid_fn(adev, req->pasid, req->data);
> +       }
> +
> +       return 0;
> +}
> +
> +static int amdgpu_ras_sys_gen_seqno(struct ras_core_context *ras_core,
> +                       enum ras_seqno_type seqno_type, uint64_t *seqno)
> +{
> +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> +       struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
> +       struct ras_event_manager *event_mgr;
> +       struct ras_event_state *event_state;
> +       struct amdgpu_hive_info *hive;
> +       enum ras_event_type event_type;
> +       uint64_t seq_no;
> +
> +       if (!ras_mgr || !seqno ||
> +               (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
> +               return -EINVAL;
> +
> +       switch (seqno_type) {
> +       case RAS_SEQNO_TYPE_UE:
> +               event_type = RAS_EVENT_TYPE_FATAL;
> +               break;
> +       case RAS_SEQNO_TYPE_CE:
> +       case RAS_SEQNO_TYPE_DE:
> +               event_type = RAS_EVENT_TYPE_POISON_CREATION;
> +               break;
> +       case RAS_SEQNO_TYPE_POISON_CONSUMPTION:
> +               event_type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
> +               break;
> +       default:
> +               event_type = RAS_EVENT_TYPE_INVALID;
> +               break;
> +       }
> +
> +       hive = amdgpu_get_xgmi_hive(adev);
> +       event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;
> +       event_state = &event_mgr->event_state[event_type];
> +       if ((event_type == RAS_EVENT_TYPE_FATAL) && amdgpu_ras_in_recovery(adev)) {
> +               seq_no = event_state->last_seqno;
> +       } else {
> +               seq_no = atomic64_inc_return(&event_mgr->seqno);
> +               event_state->last_seqno = seq_no;
> +               atomic64_inc(&event_state->count);
> +       }
> +       amdgpu_put_xgmi_hive(hive);
> +
> +       *seqno = seq_no;
> +       return 0;
> +}
> +
> +static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
> +                                  enum ras_notify_event event_id, void *data)
> +{
> +       struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
> +       int ret = 0;
> +
> +       switch (event_id) {
> +       case RAS_EVENT_ID__BAD_PAGE_DETECTED:
> +               schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
> +               break;
> +       case RAS_EVENT_ID__POISON_CONSUMPTION:
> +               amdgpu_ras_sys_poison_consumption_event(ras_core, data);
> +               break;
> +       case RAS_EVENT_ID__RESERVE_BAD_PAGE:
> +               ret = amdgpu_ras_reserve_page(ras_core->dev, *(uint64_t *)data);
> +               break;
> +       case RAS_EVENT_ID__FATAL_ERROR_DETECTED:
> +               ret = amdgpu_ras_sys_detect_fatal_event(ras_core, data);
> +               break;
> +       case RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM:
> +               ret = amdgpu_dpm_send_hbm_bad_pages_num(ras_core->dev, *(uint32_t *)data);
> +               break;
> +       case RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP:
> +               ret = amdgpu_dpm_send_hbm_bad_channel_flag(ras_core->dev, *(uint32_t *)data);
> +               break;
> +       case RAS_EVENT_ID__DEVICE_RMA:
> +               ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL);
> +               ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
> +               break;
> +       case RAS_EVENT_ID__RESET_GPU:
> +               ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
> +               break;
> +       default:
> +               RAS_DEV_WARN(ras_core->dev, "Invalid RAS notify event: %d\n", event_id);
> +               break;
> +       }
> +
> +       return ret;
> +}
> +
> +static u64 amdgpu_ras_sys_get_utc_second_timestamp(struct ras_core_context *ras_core)
> +{
> +       return ktime_get_real_seconds();
> +}
> +
> +static int amdgpu_ras_sys_check_gpu_status(struct ras_core_context *ras_core,
> +                               uint32_t *status)
> +{
> +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> +       uint32_t gpu_status = 0;
> +
> +       if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
> +               gpu_status |= RAS_GPU_STATUS__IN_RESET;
> +
> +       if (amdgpu_sriov_vf(adev))
> +               gpu_status |= RAS_GPU_STATUS__IS_VF;
> +
> +       *status = gpu_status;
> +
> +       return 0;
> +}
> +
> +static int amdgpu_ras_sys_get_device_system_info(struct ras_core_context *ras_core,
> +                       struct device_system_info *dev_info)
> +{
> +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> +
> +       dev_info->device_id = adev->pdev->device;
> +       dev_info->vendor_id = adev->pdev->vendor;
> +       dev_info->socket_id = adev->smuio.funcs->get_socket_id(adev);
> +
> +       return 0;
> +}
> +
> +static int amdgpu_ras_sys_gpu_reset_lock(struct ras_core_context *ras_core,
> +                       bool down, bool try)
> +{
> +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> +       int ret = 0;
> +
> +       if (down && try)
> +               ret = down_read_trylock(&adev->reset_domain->sem);
> +       else if (down)
> +               down_read(&adev->reset_domain->sem);
> +       else
> +               up_read(&adev->reset_domain->sem);
> +
> +       return ret;
> +}
> +
> +static bool amdgpu_ras_sys_detect_ras_interrupt(struct ras_core_context *ras_core)
> +{
> +       return !!atomic_read(&amdgpu_ras_in_intr);
> +}
> +
> +static int amdgpu_ras_sys_get_gpu_mem(struct ras_core_context *ras_core,
> +       enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
> +{
> +       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> +       struct psp_context *psp = &adev->psp;
> +       struct psp_ring *psp_ring;
> +       struct ta_mem_context *mem_ctx;
> +
> +       if (mem_type == GPU_MEM_TYPE_RAS_PSP_RING) {
> +               psp_ring = &psp->km_ring;
> +               gpu_mem->mem_bo = adev->firmware.rbuf;
> +               gpu_mem->mem_size = psp_ring->ring_size;
> +               gpu_mem->mem_mc_addr = psp_ring->ring_mem_mc_addr;
> +               gpu_mem->mem_cpu_addr = psp_ring->ring_mem;
> +       } else if (mem_type == GPU_MEM_TYPE_RAS_PSP_CMD) {
> +               gpu_mem->mem_bo = psp->cmd_buf_bo;
> +               gpu_mem->mem_size = PSP_CMD_BUFFER_SIZE;
> +               gpu_mem->mem_mc_addr = psp->cmd_buf_mc_addr;
> +               gpu_mem->mem_cpu_addr = psp->cmd_buf_mem;
> +       } else if (mem_type == GPU_MEM_TYPE_RAS_PSP_FENCE) {
> +               gpu_mem->mem_bo = psp->fence_buf_bo;
> +               gpu_mem->mem_size = PSP_FENCE_BUFFER_SIZE;
> +               gpu_mem->mem_mc_addr = psp->fence_buf_mc_addr;
> +               gpu_mem->mem_cpu_addr = psp->fence_buf;
> +       } else if (mem_type == GPU_MEM_TYPE_RAS_TA_FW) {
> +               gpu_mem->mem_bo = psp->fw_pri_bo;
> +               gpu_mem->mem_size = PSP_1_MEG;
> +               gpu_mem->mem_mc_addr = psp->fw_pri_mc_addr;
> +               gpu_mem->mem_cpu_addr = psp->fw_pri_buf;
> +       } else if (mem_type == GPU_MEM_TYPE_RAS_TA_CMD) {
> +               mem_ctx = &psp->ras_context.context.mem_context;
> +               gpu_mem->mem_bo = mem_ctx->shared_bo;
> +               gpu_mem->mem_size = mem_ctx->shared_mem_size;
> +               gpu_mem->mem_mc_addr = mem_ctx->shared_mc_addr;
> +               gpu_mem->mem_cpu_addr = mem_ctx->shared_buf;
> +       } else {
> +               return -EINVAL;
> +       }
> +
> +       if (!gpu_mem->mem_bo || !gpu_mem->mem_size ||
> +               !gpu_mem->mem_mc_addr || !gpu_mem->mem_cpu_addr) {
> +               RAS_DEV_ERR(ras_core->dev, "The RAS PSP GPU memory is invalid!\n");
> +               return -ENOMEM;
> +       }
> +
> +       return 0;
> +}
> +
> +static int amdgpu_ras_sys_put_gpu_mem(struct ras_core_context *ras_core,
> +       enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
> +{
> +       return 0;
> +}
> +
> +const struct ras_sys_func amdgpu_ras_sys_fn = {
> +       .ras_notifier = amdgpu_ras_sys_event_notifier,
> +       .get_utc_second_timestamp = amdgpu_ras_sys_get_utc_second_timestamp,
> +       .gen_seqno = amdgpu_ras_sys_gen_seqno,
> +       .check_gpu_status = amdgpu_ras_sys_check_gpu_status,
> +       .get_device_system_info = amdgpu_ras_sys_get_device_system_info,
> +       .gpu_reset_lock = amdgpu_ras_sys_gpu_reset_lock,
> +       .detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt,
> +       .get_gpu_mem = amdgpu_ras_sys_get_gpu_mem,
> +       .put_gpu_mem = amdgpu_ras_sys_put_gpu_mem,
> +};
> diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> new file mode 100644
> index 000000000000..c48ff26525d6
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> @@ -0,0 +1,109 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright 2025 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __RAS_SYS_H__
> +#define __RAS_SYS_H__
> +#include <linux/stdarg.h>
> +#include <linux/printk.h>
> +#include <linux/dev_printk.h>
> +#include "amdgpu.h"
> +
> +#define RAS_DEV_ERR(device, fmt, ...)                                          \
> +       do {                                                                    \
> +               if (device)                                                     \
> +                       dev_err(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \
> +               else                                                            \
> +                       printk(KERN_ERR fmt, ##__VA_ARGS__);                    \
> +       } while (0)
> +
> +#define RAS_DEV_WARN(device, fmt, ...)                                         \
> +       do {                                                                    \
> +               if (device)                                                     \
> +                       dev_warn(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \
> +               else                                                            \
> +                       printk(KERN_WARNING fmt, ##__VA_ARGS__);                \
> +       } while (0)
> +
> +#define RAS_DEV_INFO(device, fmt, ...)                                         \
> +       do {                                                                    \
> +               if (device)                                                     \
> +                       dev_info(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \
> +               else                                                            \
> +                       printk(KERN_INFO fmt, ##__VA_ARGS__);                   \
> +       } while (0)
> +
> +#define RAS_DEV_DBG(device, fmt, ...)                                          \
> +       do {                                                                    \
> +               if (device)                                                     \
> +                       dev_dbg(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \
> +               else                                                            \
> +                       printk(KERN_DEBUG fmt, ##__VA_ARGS__);                  \
> +       } while (0)
> +
> +#define RAS_INFO(fmt, ...)  printk(KERN_INFO fmt, ##__VA_ARGS__)

Why do we need these wrappers?  Is there ever a case where we don't
have a device?
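For example, assuming 'device' can never actually be NULL by the time
these are called, each wrapper would presumably collapse to a plain
one-liner (hypothetical sketch, not part of the patch):

    /* hypothetical simplification, assuming 'device' is never NULL */
    #define RAS_DEV_ERR(device, fmt, ...) \
            dev_err(((struct amdgpu_device *)(device))->dev, fmt, ##__VA_ARGS__)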

Alex

> +
> +#define RAS_DEV_RREG32_SOC15(dev, ip, inst, reg) \
> +({ \
> +       struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
> +       __RREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg, \
> +                        0, ip##_HWIP, inst); \
> +})
> +
> +#define RAS_DEV_WREG32_SOC15(dev, ip, inst, reg, value) \
> +({ \
> +       struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
> +       __WREG32_SOC15_RLC__((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg), \
> +                         value, 0, ip##_HWIP, inst); \
> +})
> +
> +/* RAS_GET_INST returns the physical instance corresponding to a logical instance */
> +#define RAS_GET_INST(dev, ip, inst) \
> +({ \
> +       struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
> +       adev->ip_map.logical_to_dev_inst ? \
> +               adev->ip_map.logical_to_dev_inst(adev, ip##_HWIP, inst) : inst; \
> +})
> +
> +#define RAS_GET_MASK(dev, ip, mask) \
> +({ \
> +       struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
> +       (adev->ip_map.logical_to_dev_mask ? \
> +               adev->ip_map.logical_to_dev_mask(adev, ip##_HWIP, mask) : mask); \
> +})
> +
> +static inline void *ras_radix_tree_delete_iter(struct radix_tree_root *root, void *iter)
> +{
> +       return radix_tree_delete(root, ((struct radix_tree_iter *)iter)->index);
> +}
> +
> +static inline long ras_wait_event_interruptible_timeout(void *wq_head,
> +                       int (*condition)(void *param), void *param, unsigned int timeout)
> +{
> +       return wait_event_interruptible_timeout(*(wait_queue_head_t *)wq_head,
> +                               condition(param), timeout);
> +}
> +
> +extern const struct ras_sys_func amdgpu_ras_sys_fn;
> +
> +#endif
> --
> 2.34.1
>
