Add a thread to handle RAS events.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
Reviewed-by: Tao Zhou <tao.zh...@amd.com>
Reviewed-by: Hawking Zhang <hawking.zh...@amd.com>
---
 drivers/gpu/drm/amd/ras/rascore/ras_process.c | 315 ++++++++++++++++++
 drivers/gpu/drm/amd/ras/rascore/ras_process.h |  53 +++
 2 files changed, 368 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/ras/rascore/ras_process.c
 create mode 100644 drivers/gpu/drm/amd/ras/rascore/ras_process.h
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_process.c b/drivers/gpu/drm/amd/ras/rascore/ras_process.c
new file mode 100644
index 000000000000..02f0657f78a3
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_process.c
@@ -0,0 +1,421 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_process.h"
+
+/* Capacity requested for the event fifo in bytes: room for 128 records. */
+#define RAS_EVENT_FIFO_SIZE (128 * sizeof(struct ras_event_req))
+
+/* Timeout, in milliseconds, of one wait of the processing thread. */
+#define RAS_POLLING_ECC_TIMEOUT  300
+
+/*
+ * Queue one event record into the event fifo.
+ *
+ * The fifo is byte based and kfifo_in() copies as many bytes as currently
+ * fit, so the free space is checked under the fifo lock first.  A record is
+ * therefore queued completely or not at all and the readers never see a
+ * truncated record.
+ *
+ * Returns 0 on success or -ENOSPC when the record does not fit.
+ */
+static int ras_process_put_event(struct ras_core_context *ras_core,
+		struct ras_event_req *req)
+{
+	struct ras_process *ras_proc = &ras_core->ras_proc;
+	unsigned int copied = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ras_proc->fifo_spinlock, flags);
+	if (kfifo_avail(&ras_proc->event_fifo) >= sizeof(*req))
+		copied = kfifo_in(&ras_proc->event_fifo, req, sizeof(*req));
+	spin_unlock_irqrestore(&ras_proc->fifo_spinlock, flags);
+
+	if (copied != sizeof(*req)) {
+		RAS_DEV_ERR(ras_core->dev, "Poison message fifo is full!\n");
+		return -ENOSPC;
+	}
+
+	return 0;
+}
+
+/* Queue a record that carries nothing but the GPU reset cause @reset_cause. */
+static int ras_process_add_reset_gpu_event(struct ras_core_context *ras_core,
+			uint32_t reset_cause)
+{
+	struct ras_event_req req = {0};
+
+	req.reset = reset_cause;
+
+	return ras_process_put_event(ras_core, &req);
+}
+
+/*
+ * Dequeue one event record into @req.
+ *
+ * Returns the number of bytes taken from the fifo, 0 when it is empty.
+ */
+static int ras_process_get_event(struct ras_core_context *ras_core,
+		struct ras_event_req *req)
+{
+	struct ras_process *ras_proc = &ras_core->ras_proc;
+
+	return kfifo_out_spinlocked(&ras_proc->event_fifo,
+			req, sizeof(*req), &ras_proc->fifo_spinlock);
+}
+
+/* Drop every record that is still queued in the event fifo. */
+static void ras_process_clear_event_fifo(struct ras_core_context *ras_core)
+{
+	struct ras_event_req req;
+	int ret;
+
+	do {
+		ret = ras_process_get_event(ras_core, &req);
+	} while (ret);
+}
+
+/* Retry budget (one msleep(1) per retry) for UMC data that is not ready. */
+#define AMDGPU_RAS_WAITING_DATA_READY  200
+
+/*
+ * Poll the UMC ECC data until @event_count new errors have been observed.
+ *
+ * Each pass refreshes the ECC info and queries the UMC block.  When a pass
+ * reports nothing new while events are still expected, the loop sleeps for
+ * 1 ms and retries; the retry budget is re-armed whenever new errors show
+ * up and the loop gives up once the budget is used up.  With @event_count
+ * equal to 0 exactly one pass is made.
+ *
+ * If any error was observed and ras_core_gpu_is_rma() is true, a reset
+ * event with cause GPU_RESET_CAUSE_RMA is queued.
+ *
+ * Returns 0, or the error returned by the ECC update/query helpers.
+ */
+static int ras_process_umc_event(struct ras_core_context *ras_core,
+				uint32_t event_count)
+{
+	struct ras_ecc_count ecc_data;
+	int ret = 0;
+	uint32_t timeout = 0;
+	uint32_t detected_de_count = 0;
+
+	do {
+		memset(&ecc_data, 0, sizeof(ecc_data));
+		ret = ras_core_update_ecc_info(ras_core);
+		if (ret)
+			return ret;
+
+		ret = ras_core_query_block_ecc_data(ras_core, RAS_BLOCK_ID__UMC, &ecc_data);
+		if (ret)
+			return ret;
+
+		if (ecc_data.new_de_count) {
+			detected_de_count += ecc_data.new_de_count;
+			timeout = 0;
+		} else {
+			/* Arm the retry budget on the first empty pass. */
+			if (!timeout && event_count)
+				timeout = AMDGPU_RAS_WAITING_DATA_READY;
+
+			if (timeout) {
+				if (!--timeout)
+					break;
+
+				msleep(1);
+			}
+		}
+	} while (detected_de_count < event_count);
+
+	if (detected_de_count && ras_core_gpu_is_rma(ras_core))
+		ras_process_add_reset_gpu_event(ras_core, GPU_RESET_CAUSE_RMA);
+
+	return 0;
+}
+
+/*
+ * Drain the records that are queued in the event fifo on entry.
+ *
+ * Every record is reported as RAS_EVENT_ID__POISON_CONSUMPTION and its
+ * reset cause is accumulated.  If any record asked for a reset, a single
+ * RAS_EVENT_ID__RESET_GPU notification carrying the accumulated causes is
+ * sent afterwards.
+ *
+ * Returns -RAS_CORE_GPU_IN_MODE1_RESET when the reset notification
+ * succeeded and the causes include GPU_RESET_CAUSE_RMA, otherwise the
+ * result of the reset notification, or 0 when no reset was requested.
+ */
+static int ras_process_non_umc_event(struct ras_core_context *ras_core)
+{
+	struct ras_process *ras_proc = &ras_core->ras_proc;
+	struct ras_event_req req;
+	uint32_t event_count;
+	uint32_t reset_flags = 0;
+	uint32_t i;
+	int ret = 0;
+
+	/* kfifo_len() counts bytes for this fifo; convert it into records. */
+	event_count = kfifo_len(&ras_proc->event_fifo) / sizeof(req);
+
+	for (i = 0; i < event_count; i++) {
+		memset(&req, 0, sizeof(req));
+		if (!ras_process_get_event(ras_core, &req))
+			break;
+
+		ras_core_event_notify(ras_core,
+				RAS_EVENT_ID__POISON_CONSUMPTION, &req);
+
+		reset_flags |= req.reset;
+
+		/* RMA records queued by this file only carry the reset cause. */
+		if (req.reset == GPU_RESET_CAUSE_RMA)
+			continue;
+
+		if (req.reset)
+			RAS_DEV_INFO(ras_core->dev,
+				"{%llu} GPU reset for %s RAS poison consumption is issued!\n",
+				req.seqno, ras_core_get_ras_block_name(req.block));
+		else
+			RAS_DEV_INFO(ras_core->dev,
+				"{%llu} %s RAS poison consumption is issued!\n",
+				req.seqno, ras_core_get_ras_block_name(req.block));
+	}
+
+	if (reset_flags) {
+		ret = ras_core_event_notify(ras_core,
+				RAS_EVENT_ID__RESET_GPU, &reset_flags);
+		if (!ret && (reset_flags & GPU_RESET_CAUSE_RMA))
+			return -RAS_CORE_GPU_IN_MODE1_RESET;
+	}
+
+	return ret;
+}
+
+/*
+ * Process all pending RAS events: first the counted UMC interrupts, then
+ * the records queued in the event fifo.
+ *
+ * When either step returns -RAS_CORE_GPU_IN_MODE1_RESET, the event fifo and
+ * the UMC interrupt counter are cleared.
+ *
+ * Returns the result of the last step that ran.
+ */
+int ras_process_handle_ras_event(struct ras_core_context *ras_core)
+{
+	struct ras_process *ras_proc = &ras_core->ras_proc;
+	uint32_t umc_event_count;
+	int ret;
+
+	ras_aca_clear_fatal_flag(ras_core);
+	ras_umc_log_pending_bad_bank(ras_core);
+
+	do {
+		umc_event_count = atomic_read(&ras_proc->umc_interrupt_count);
+		ret = ras_process_umc_event(ras_core, umc_event_count);
+		if (ret == -RAS_CORE_GPU_IN_MODE1_RESET)
+			break;
+
+		/* Only retire what was sampled; newer interrupts loop again. */
+		if (umc_event_count)
+			atomic_sub(umc_event_count, &ras_proc->umc_interrupt_count);
+	} while (atomic_read(&ras_proc->umc_interrupt_count));
+
+	if ((ret != -RAS_CORE_GPU_IN_MODE1_RESET) &&
+	    (kfifo_len(&ras_proc->event_fifo)))
+		ret = ras_process_non_umc_event(ras_core);
+
+	if (ret == -RAS_CORE_GPU_IN_MODE1_RESET) {
+		/* Clear poison fifo */
+		ras_process_clear_event_fifo(ras_core);
+		atomic_set(&ras_proc->umc_interrupt_count, 0);
+	}
+
+	return ret;
+}
+
+/* Wait condition of the thread: stop requested or a request is pending. */
+static int thread_wait_condition(void *param)
+{
+	struct ras_process *ras_proc = (struct ras_process *)param;
+
+	return (kthread_should_stop() ||
+		atomic_read(&ras_proc->ras_interrupt_req));
+}
+
+/*
+ * Main loop of the RAS processing thread.
+ *
+ * Each round waits on ras_process_wq for thread_wait_condition() with a
+ * timeout of RAS_POLLING_ECC_TIMEOUT ms and then handles the RAS events,
+ * through the sys_fn->async_handle_ras_event hook when one is installed,
+ * directly otherwise.  Nothing is handled while the core is not initialized
+ * or the GPU is in reset.
+ */
+static int ras_process_thread(void *context)
+{
+	struct ras_core_context *ras_core = (struct ras_core_context *)context;
+	struct ras_process *ras_proc = &ras_core->ras_proc;
+
+	while (!kthread_should_stop()) {
+		ras_wait_event_interruptible_timeout(&ras_proc->ras_process_wq,
+			thread_wait_condition, ras_proc,
+			msecs_to_jiffies(RAS_POLLING_ECC_TIMEOUT));
+
+		if (kthread_should_stop())
+			break;
+
+		/*
+		 * Consume the request before any early "continue"; if it
+		 * stayed set, the wait condition would remain true and the
+		 * loop would presumably spin without sleeping.  Queued work
+		 * is not lost: it is handled on a later wake-up.
+		 */
+		atomic_set(&ras_proc->ras_interrupt_req, 0);
+
+		if (!ras_core->is_initialized)
+			continue;
+
+		if (ras_core_gpu_in_reset(ras_core))
+			continue;
+
+		if (ras_core->sys_fn && ras_core->sys_fn->async_handle_ras_event)
+			ras_core->sys_fn->async_handle_ras_event(ras_core, NULL);
+		else
+			ras_process_handle_ras_event(ras_core);
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate the event fifo and start the RAS processing thread.
+ *
+ * Returns 0 on success or a negative error code; on failure everything
+ * that was set up is released again.
+ */
+int ras_process_init(struct ras_core_context *ras_core)
+{
+	struct ras_process *ras_proc = &ras_core->ras_proc;
+	int ret;
+
+	ret = kfifo_alloc(&ras_proc->event_fifo, RAS_EVENT_FIFO_SIZE, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	spin_lock_init(&ras_proc->fifo_spinlock);
+
+	init_waitqueue_head(&ras_proc->ras_process_wq);
+
+	ras_proc->ras_process_thread = kthread_run(ras_process_thread,
+							(void *)ras_core, "ras_process_thread");
+	/* kthread_run() reports failure with an ERR_PTR() value, never NULL. */
+	if (IS_ERR(ras_proc->ras_process_thread)) {
+		RAS_DEV_ERR(ras_core->dev, "Failed to create ras_process_thread.\n");
+		ret = PTR_ERR(ras_proc->ras_process_thread);
+		/* Keep ras_process_fini() from stopping a thread that never ran. */
+		ras_proc->ras_process_thread = NULL;
+		goto err;
+	}
+
+	return 0;
+
+err:
+	ras_process_fini(ras_core);
+	return ret;
+}
+
+/* Stop the processing thread, if it is running, and free the event fifo. */
+int ras_process_fini(struct ras_core_context *ras_core)
+{
+	struct ras_process *ras_proc = &ras_core->ras_proc;
+
+	if (ras_proc->ras_process_thread) {
+		kthread_stop(ras_proc->ras_process_thread);
+		ras_proc->ras_process_thread = NULL;
+	}
+
+	kfifo_free(&ras_proc->event_fifo);
+
+	return 0;
+}
+
+/* Count one UMC interrupt and wake the processing thread; @req is unused. */
+static int ras_process_add_umc_interrupt_req(struct ras_core_context *ras_core,
+			struct ras_event_req *req)
+{
+	struct ras_process *ras_proc = &ras_core->ras_proc;
+
+	atomic_inc(&ras_proc->umc_interrupt_count);
+	atomic_inc(&ras_proc->ras_interrupt_req);
+
+	wake_up(&ras_proc->ras_process_wq);
+	return 0;
+}
+
+/* Queue @req and, if it was accepted, wake the processing thread. */
+static int ras_process_add_non_umc_interrupt_req(struct ras_core_context *ras_core,
+		struct ras_event_req *req)
+{
+	struct ras_process *ras_proc = &ras_core->ras_proc;
+	int ret;
+
+	ret = ras_process_put_event(ras_core, req);
+	if (!ret) {
+		atomic_inc(&ras_proc->ras_interrupt_req);
+		wake_up(&ras_proc->ras_process_wq);
+	}
+
+	return ret;
+}
+
+/*
+ * Hand an interrupt request over to the RAS processing thread.
+ *
+ * UMC requests are only counted, every other request is queued as a copy
+ * of @req.
+ *
+ * Returns 0 on success, -EINVAL for a missing @ras_core or a non-UMC
+ * request without @req, -EPERM when the core is not initialized and
+ * -ENOSPC when the event fifo is full.
+ */
+int ras_process_add_interrupt_req(struct ras_core_context *ras_core,
+	struct ras_event_req *req, bool is_umc)
+{
+	int ret;
+
+	if (!ras_core)
+		return -EINVAL;
+
+	if (!ras_core->is_initialized)
+		return -EPERM;
+
+	if (is_umc)
+		ret = ras_process_add_umc_interrupt_req(ras_core, req);
+	else if (req)
+		ret = ras_process_add_non_umc_interrupt_req(ras_core, req);
+	else
+		ret = -EINVAL;
+
+	return ret;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_process.h b/drivers/gpu/drm/amd/ras/rascore/ras_process.h
new file mode 100644
index 000000000000..28458b50510e
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_process.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __RAS_PROCESS_H__
+#define __RAS_PROCESS_H__
+
+/* One RAS event record as it is queued in ras_process.event_fifo. */
+struct ras_event_req {
+	uint64_t seqno;
+	uint32_t idx_vf;
+	uint32_t block;
+	uint16_t pasid;
+	uint32_t reset;		/* GPU reset cause, 0 when no reset is requested */
+	void *pasid_fn;
+	void *data;
+};
+
+/* State of the RAS event processing thread and its event queue. */
+struct ras_process {
+	void *dev;
+	void *ras_process_thread;	/* thread handle, NULL while not running */
+	wait_queue_head_t ras_process_wq;
+	atomic_t ras_interrupt_req;	/* pending wake-up requests */
+	atomic_t umc_interrupt_count;	/* UMC interrupts not yet processed */
+	struct kfifo event_fifo;	/* queued struct ras_event_req records */
+	spinlock_t fifo_spinlock;	/* serializes event_fifo accesses */
+};
+
+struct ras_core_context;
+
+/* Allocate the event fifo and start the RAS processing thread. */
+int ras_process_init(struct ras_core_context *ras_core);
+/* Stop the RAS processing thread and free the event fifo. */
+int ras_process_fini(struct ras_core_context *ras_core);
+/* Process the pending UMC interrupts and the queued event records. */
+int ras_process_handle_ras_event(struct ras_core_context *ras_core);
+/* Queue an interrupt request for the RAS processing thread. */
+int ras_process_add_interrupt_req(struct ras_core_context *ras_core,
+		struct ras_event_req *req, bool is_umc);
+#endif
-- 
2.34.1