From: Riana Tauro <[email protected]> DO NOT REVIEW. COMPILATION ONLY. This patch is from https://patchwork.freedesktop.org/series/160482/ and was added only for compilation.
Signed-off-by: Riana Tauro <[email protected]> Signed-off-by: Mallesh Koujalagi <[email protected]> --- drivers/gpu/drm/xe/Makefile | 1 + drivers/gpu/drm/xe/xe_device.c | 5 +- drivers/gpu/drm/xe/xe_device.h | 27 +- drivers/gpu/drm/xe/xe_device_types.h | 12 +- drivers/gpu/drm/xe/xe_gt.c | 14 +- drivers/gpu/drm/xe/xe_guc_submit.c | 9 +- drivers/gpu/drm/xe/xe_pci.c | 9 + drivers/gpu/drm/xe/xe_pci_error.c | 135 +++++ drivers/gpu/drm/xe/xe_pci_error.h | 13 + drivers/gpu/drm/xe/xe_ras.c | 481 +++++++++++++++++- drivers/gpu/drm/xe/xe_ras.h | 3 +- drivers/gpu/drm/xe/xe_ras_types.h | 231 ++++++++- drivers/gpu/drm/xe/xe_survivability_mode.c | 13 +- drivers/gpu/drm/xe/xe_sysctrl_event.c | 2 +- drivers/gpu/drm/xe/xe_sysctrl_event_types.h | 3 - drivers/gpu/drm/xe/xe_sysctrl_mailbox.h | 1 - drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h | 11 + 17 files changed, 933 insertions(+), 37 deletions(-) create mode 100644 drivers/gpu/drm/xe/xe_pci_error.c create mode 100644 drivers/gpu/drm/xe/xe_pci_error.h diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 8e7b146880f4..3c001b2a4aec 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -101,6 +101,7 @@ xe-y += xe_bb.o \ xe_page_reclaim.o \ xe_pat.o \ xe_pci.o \ + xe_pci_error.o \ xe_pci_rebar.o \ xe_pcode.o \ xe_pm.o \ diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index b6e49309a99f..46d27b6f3eea 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -918,7 +918,7 @@ static void xe_device_wedged_fini(struct drm_device *drm, void *arg) { struct xe_device *xe = arg; - if (atomic_read(&xe->wedged.flag)) + if (atomic_read(&xe->wedged.fini)) xe_pm_runtime_put(xe); } @@ -1425,7 +1425,8 @@ void xe_device_declare_wedged(struct xe_device *xe) return; } - if (!atomic_xchg(&xe->wedged.flag, 1)) { + if (!atomic_xchg(&xe->wedged.fini, 1)) { + xe_device_wedged_get(xe); xe->needs_flr_on_fini = true; xe_pm_runtime_get_noresume(xe); 
drm_err(&xe->drm, diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index 975768a6a9c8..e177c05a7a95 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -181,6 +181,21 @@ static inline bool xe_device_has_mert(const struct xe_device *xe) return xe->info.has_mert; } +static inline bool xe_device_is_in_reset(struct xe_device *xe) +{ + return atomic_read(&xe->in_reset); +} + +static inline void xe_device_set_in_reset(struct xe_device *xe) +{ + atomic_set(&xe->in_reset, 1); +} + +static inline void xe_device_clear_in_reset(struct xe_device *xe) +{ + atomic_set(&xe->in_reset, 0); +} + u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size); void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p); @@ -192,9 +207,19 @@ bool xe_device_is_l2_flush_optimized(struct xe_device *xe); void xe_device_td_flush(struct xe_device *xe); void xe_device_l2_flush(struct xe_device *xe); +static inline void xe_device_wedged_get(struct xe_device *xe) +{ + atomic_inc(&xe->wedged.ref); +} + +static inline void xe_device_wedged_put(struct xe_device *xe) +{ + atomic_dec(&xe->wedged.ref); +} + static inline bool xe_device_wedged(struct xe_device *xe) { - return atomic_read(&xe->wedged.flag); + return atomic_read(&xe->wedged.ref); } void xe_device_set_wedged_method(struct xe_device *xe, unsigned long method); diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 4e2f115f14e2..ddf7fd57da98 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -485,10 +485,15 @@ struct xe_device { /** @needs_flr_on_fini: requests function-reset on fini */ bool needs_flr_on_fini; + /** @in_reset: Indicates if device is in reset */ + atomic_t in_reset; + /** @wedged: Struct to control Wedged States and mode */ struct { - /** @wedged.flag: Xe device faced a critical error and is now blocked. 
*/ - atomic_t flag; + /** @wedged.fini: Needs cleanup on fini */ + atomic_t fini; + /** @wedged.ref: Refcount for wedged device, blocks critical path execution */ + atomic_t ref; /** @wedged.mode: Mode controlled by kernel parameter and debugfs */ enum xe_wedged_mode mode; /** @wedged.method: Recovery method to be sent in the drm device wedged uevent */ @@ -497,6 +502,9 @@ struct xe_device { bool inconsistent_reset; } wedged; + /** @devres_group_id: id for devres group */ + void *devres_group_id; + /** @bo_device: Struct to control async free of BOs */ struct xe_bo_dev { /** @bo_device.async_free: Free worker */ diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 783eb6d631b5..d904527a8898 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -917,6 +917,9 @@ static void gt_reset_worker(struct work_struct *w) if (xe_device_wedged(gt_to_xe(gt))) goto err_pm_put; + if (xe_device_is_in_reset(gt_to_xe(gt))) + goto err_pm_put; + /* We only support GT resets with GuC submission */ if (!xe_device_uc_enabled(gt_to_xe(gt))) goto err_pm_put; @@ -977,18 +980,21 @@ static void gt_reset_worker(struct work_struct *w) void xe_gt_reset_async(struct xe_gt *gt) { - xe_gt_info(gt, "trying reset from %ps\n", __builtin_return_address(0)); + struct xe_device *xe = gt_to_xe(gt); + + if (xe_device_is_in_reset(xe)) + return; /* Don't do a reset while one is already in flight */ if (!xe_fault_inject_gt_reset() && xe_uc_reset_prepare(&gt->uc)) return; - xe_gt_info(gt, "reset queued\n"); + xe_gt_info(gt, "reset queued from %ps\n", __builtin_return_address(0)); /* Pair with put in gt_reset_worker() if work is enqueued */ - xe_pm_runtime_get_noresume(gt_to_xe(gt)); + xe_pm_runtime_get_noresume(xe); if (!queue_work(gt->ordered_wq, &gt->reset.worker)) - xe_pm_runtime_put(gt_to_xe(gt)); + xe_pm_runtime_put(xe); } void xe_gt_suspend_prepare(struct xe_gt *gt) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 
9458bf477fa6..12416bfa3255 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1532,7 +1532,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) * If devcoredump not captured and GuC capture for the job is not ready * do manual capture first and decide later if we need to use it */ - if (!exec_queue_killed(q) && !xe->devcoredump.captured && + if (!xe_device_is_in_reset(xe) && !exec_queue_killed(q) && !xe->devcoredump.captured && !xe_guc_capture_get_matching_and_lock(q)) { /* take force wake before engine register manual capture */ CLASS(xe_force_wake, fw_ref)(gt_to_fw(q->gt), XE_FORCEWAKE_ALL); @@ -1554,8 +1554,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) set_exec_queue_banned(q); /* Kick job / queue off hardware */ - if (!wedged && (exec_queue_enabled(primary) || - exec_queue_pending_disable(primary))) { + if (!xe_device_is_in_reset(xe) && !wedged && + (exec_queue_enabled(primary) || exec_queue_pending_disable(primary))) { int ret; if (exec_queue_reset(primary)) @@ -1623,7 +1623,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) trace_xe_sched_job_timedout(job); - if (!exec_queue_killed(q)) + /* Do not access device if in reset */ + if (!xe_device_is_in_reset(xe) && !exec_queue_killed(q)) xe_devcoredump(q, job, "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx", xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job), diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 9c249454cc95..31097725b108 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -26,6 +26,7 @@ #include "xe_guc.h" #include "xe_mmio.h" #include "xe_module.h" +#include "xe_pci_error.h" #include "xe_pci_rebar.h" #include "xe_pci_sriov.h" #include "xe_pci_types.h" @@ -1079,6 +1080,7 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) const struct xe_device_desc *desc = (const void *)ent->driver_data; const struct 
xe_subplatform_desc *subplatform_desc; struct xe_device *xe; + void *devres_id; int err; subplatform_desc = find_subplatform(desc, pdev->device); @@ -1106,6 +1108,10 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (xe_display_driver_probe_defer(pdev)) return -EPROBE_DEFER; + devres_id = devres_open_group(&pdev->dev, NULL, GFP_KERNEL); + if (!devres_id) + return -ENOMEM; + err = pcim_enable_device(pdev); if (err) return err; @@ -1114,6 +1120,8 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (IS_ERR(xe)) return PTR_ERR(xe); + xe->devres_group_id = devres_id; + pci_set_drvdata(pdev, &xe->drm); xe_pm_assert_unbounded_bridge(xe); @@ -1352,6 +1360,7 @@ static struct pci_driver xe_pci_driver = { .remove = xe_pci_remove, .shutdown = xe_pci_shutdown, .sriov_configure = xe_pci_sriov_configure, + .err_handler = &xe_pci_error_handlers, #ifdef CONFIG_PM_SLEEP .driver.pm = &xe_pm_ops, #endif diff --git a/drivers/gpu/drm/xe/xe_pci_error.c b/drivers/gpu/drm/xe/xe_pci_error.c new file mode 100644 index 000000000000..b08601f470d6 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pci_error.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2026 Intel Corporation + */ + +#include <linux/pci.h> + +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_pci.h" +#include "xe_printk.h" +#include "xe_ras.h" +#include "xe_survivability_mode.h" + +static void prepare_device_for_reset(struct pci_dev *pdev) +{ + struct xe_device *xe = pdev_to_xe_device(pdev); + struct xe_gt *gt; + u8 id; + + xe_device_set_in_reset(xe); + + /* Wedge the device to prevent userspace access during reset */ + xe_device_wedged_get(xe); + + for_each_gt(gt, xe, id) + xe_gt_declare_wedged(gt); + + pci_disable_device(pdev); +} + +static pci_ers_result_t ras_action_to_pci_result(struct pci_dev *pdev, u32 action) +{ + switch (action) { + case XE_RAS_RECOVERY_ACTION_RECOVERED: + return PCI_ERS_RESULT_RECOVERED; + case 
XE_RAS_RECOVERY_ACTION_RESET: + prepare_device_for_reset(pdev); + return PCI_ERS_RESULT_NEED_RESET; + case XE_RAS_RECOVERY_ACTION_DISCONNECT: + return PCI_ERS_RESULT_DISCONNECT; + default: + return PCI_ERS_RESULT_DISCONNECT; + } +} + +static pci_ers_result_t xe_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct xe_device *xe = pdev_to_xe_device(pdev); + + xe_err(xe, "PCI error: detected state = %u\n", state); + + if (state == pci_channel_io_perm_failure) + return PCI_ERS_RESULT_DISCONNECT; + + /* If the device is already wedged or in survivability mode, do not attempt recovery */ + if (xe_survivability_mode_is_boot_enabled(xe) || xe_device_wedged(xe)) + return PCI_ERS_RESULT_DISCONNECT; + + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + prepare_device_for_reset(pdev); + return PCI_ERS_RESULT_NEED_RESET; + default: + xe_err(xe, "PCI error: unknown state %d\n", state); + return PCI_ERS_RESULT_NEED_RESET; + } +} + +static pci_ers_result_t xe_pci_error_mmio_enabled(struct pci_dev *pdev) +{ + struct xe_device *xe = pdev_to_xe_device(pdev); + enum xe_ras_recovery_action action; + + xe_err(xe, "PCI error: MMIO enabled\n"); + + action = xe_ras_process_errors(xe); + + return ras_action_to_pci_result(pdev, action); +} + +static pci_ers_result_t xe_pci_error_slot_reset(struct pci_dev *pdev) +{ + const struct pci_device_id *ent = pci_match_id(pdev->driver->id_table, pdev); + struct xe_device *xe = pdev_to_xe_device(pdev); + + xe_err(xe, "PCI error: slot reset\n"); + + pci_restore_state(pdev); + + if (pci_enable_device(pdev)) { + xe_err(xe, "Cannot re-enable PCI device after reset\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + + /* + * Secondary Bus Reset causes all VRAM state to be lost along with + * hardware state. As an initial step, re-probe the device to + * re-initialize the driver and hardware. 
+ * TODO: optimize by re-initializing only the hardware state and re-creating + * kernel BOs. + */ + xe_device_clear_in_reset(xe); + pdev->driver->remove(pdev); + devres_release_group(&pdev->dev, xe->devres_group_id); + + if (pdev->driver->probe(pdev, ent)) + return PCI_ERS_RESULT_DISCONNECT; + + xe = pdev_to_xe_device(pdev); + + /* Wedge the device to prevent I/O operations till the resume callback */ + xe_device_wedged_get(xe); + + return PCI_ERS_RESULT_RECOVERED; +} + +static void xe_pci_error_resume(struct pci_dev *pdev) +{ + struct xe_device *xe = pdev_to_xe_device(pdev); + + xe_err(xe, "PCI error: resume\n"); + + /* Resume I/O operations */ + xe_device_wedged_put(xe); +} + +const struct pci_error_handlers xe_pci_error_handlers = { + .error_detected = xe_pci_error_detected, + .mmio_enabled = xe_pci_error_mmio_enabled, + .slot_reset = xe_pci_error_slot_reset, + .resume = xe_pci_error_resume, +}; diff --git a/drivers/gpu/drm/xe/xe_pci_error.h b/drivers/gpu/drm/xe/xe_pci_error.h new file mode 100644 index 000000000000..725ad0214e62 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pci_error.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2026 Intel Corporation + */ + +#ifndef _XE_PCI_ERROR_H_ +#define _XE_PCI_ERROR_H_ + +struct pci_error_handlers; + +extern const struct pci_error_handlers xe_pci_error_handlers; + +#endif diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c index 44f4e1a3455b..d6db0e98586f 100644 --- a/drivers/gpu/drm/xe/xe_ras.c +++ b/drivers/gpu/drm/xe/xe_ras.c @@ -3,17 +3,23 @@ * Copyright © 2026 Intel Corporation */ +#include "xe_assert.h" +#include "xe_bo.h" #include "xe_device.h" #include "xe_drm_ras.h" #include "xe_pm.h" #include "xe_printk.h" #include "xe_ras.h" #include "xe_ras_types.h" +#include "xe_survivability_mode.h" #include "xe_sysctrl.h" #include "xe_sysctrl_event_types.h" #include "xe_sysctrl_mailbox.h" #include "xe_sysctrl_mailbox_types.h" +#define CORE_COMPUTE_UNCORR_TYPE GENMASK(26, 25) 
+#define GLOBAL_UNCORR_ERROR 2 + /* Severity of detected errors */ enum xe_ras_severity { XE_RAS_SEV_NOT_SUPPORTED = 0, @@ -35,17 +41,6 @@ enum xe_ras_component { XE_RAS_COMP_MAX }; -/* RAS response status codes */ -enum xe_ras_response_status { - XE_RAS_STATUS_SUCCESS = 0, - XE_RAS_STATUS_INVALID_PARAM, - XE_RAS_STATUS_OP_NOT_SUPPORTED, - XE_RAS_STATUS_TIMEOUT, - XE_RAS_STATUS_HARDWARE_FAILURE, - XE_RAS_STATUS_INSUFFICIENT_RESOURCES, - XE_RAS_STATUS_MAX -}; - static const char *const xe_ras_severities[] = { [XE_RAS_SEV_NOT_SUPPORTED] = "Not Supported", [XE_RAS_SEV_CORRECTABLE] = "Correctable Error", @@ -131,6 +126,305 @@ static inline const char *comp_to_str(u8 component) return xe_ras_components[component]; } +static int send_page_offline(struct xe_device *xe, enum xe_ras_page_action action, u64 page_address) +{ + struct xe_sysctrl_mailbox_command command = {0}; + struct xe_ras_page_offline_request request = {0}; + struct xe_ras_page_offline_response response = {0}; + size_t rlen; + int ret; + + if (!xe->info.has_sysctrl) + return 0; + + if (action >= XE_RAS_PAGE_ACTION_MAX) { + xe_err(xe, "[RAS]: Invalid page offline action %d\n", action); + return -EINVAL; + } + + request.page_address = page_address; + request.action = action; + + xe_sysctrl_create_command(&command, XE_SYSCTRL_GROUP_GFSP, XE_SYSCTRL_CMD_PAGE_OFFLINE, + &request, sizeof(request), &response, sizeof(response)); + + ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen); + if (ret) { + xe_err(xe, "sysctrl: failed to send page offline command %d\n", ret); + return ret; + } + + if (rlen != sizeof(response)) { + xe_err(xe, "sysctrl: unexpected page offline response length %zu (expected %zu)\n", + rlen, sizeof(response)); + return -EINVAL; + } + + ret = ras_status_to_errno(response.status); + if (ret) { + xe_err(xe, "sysctrl: page offline command failed with status %d\n", + response.status); + } + + return ret; +} + +static int handle_page_offline(struct xe_device *xe, u64 page_address, bool 
send_offline_cmd) +{ + enum xe_ras_page_action action; + int ret = 0; + + if (!IS_ALIGNED(page_address, XE_PAGE_SIZE)) { + xe_err(xe, "sysctrl: Unaligned page address: 0x%llx\n", page_address); + return -EINVAL; + } + + /* + * TODO: Call function to handle address fault + * ret = xe_ttm_vram_handle_addr_fault(xe, page_address); + */ + + /* + * Handle return code from address fault handling function: + * 0: Address is valid and can be offlined + * -EIO: Address belongs to a critical BO that cannot be offlined + * -EOPNOTSUPP: Address is valid and can be offlined but user policy is not to offline + * + * For any other non-zero error code, skip offlining. + */ + + switch (ret) { + case 0: + action = XE_RAS_PAGE_ACTION_OFFLINE; + break; + /* User policy set to decline page offlining */ + case -EOPNOTSUPP: + action = XE_RAS_PAGE_ACTION_DECLINE; + break; + case -EIO: + xe_err(xe, "[RAS]: Page address belongs to critical BO: 0x%llx\n", + page_address); + return ret; + default: + xe_err(xe, "[RAS]: Failed to handle address fault 0x%llx: %d\n", + page_address, ret); + return 0; + } + + if (send_offline_cmd) { + ret = send_page_offline(xe, action, page_address); + if (ret) + xe_err(xe, "sysctrl: Failed to offline page for address 0x%llx: %d\n", + page_address, ret); + return ret; + } + + return 0; +} + +static enum xe_ras_recovery_action handle_core_compute_errors(struct xe_ras_error_array *arr) +{ + struct xe_ras_compute_error *error_info = (void *)arr->details; + u8 uncorr_type; + + uncorr_type = FIELD_GET(CORE_COMPUTE_UNCORR_TYPE, error_info->log_header); + + /* Request a reset if error is global */ + if (uncorr_type == GLOBAL_UNCORR_ERROR) + return XE_RAS_RECOVERY_ACTION_RESET; + + /* + * No action needed for other errors. + * Local errors are recovered using an engine reset by GuC. 
+ */ + return XE_RAS_RECOVERY_ACTION_RECOVERED; +} + +static struct pci_dev *find_usp_dev(struct pci_dev *pdev) +{ + struct pci_dev *vsp; + + /* + * Device Hierarchy: + * + * Upstream Switch Port (USP) --> Virtual Switch Port (VSP) --> SGunit (GPU endpoint) + */ + vsp = pci_upstream_bridge(pdev); + if (!vsp) + return NULL; + + return pci_upstream_bridge(vsp); +} + +static enum xe_ras_recovery_action handle_soc_internal_errors(struct xe_device *xe, + struct xe_ras_error_array *arr) +{ + struct xe_ras_soc_error *info = (void *)arr->details; + struct xe_ras_soc_error_source *source = &info->source; + struct xe_ras_error_class *counter = &arr->counter; + + if (source->csc) { + struct xe_ras_csc_error *csc_error = (void *)info->details; + + /* + * CSC uncorrectable errors are classified as hardware errors and firmware errors. + * CSC firmware errors are critical errors that can be recovered only by firmware + * update via SPI driver. On a CSC firmware error, PCODE enables FDO mode and sets + * the bit in the capability register. On receiving this error, the driver enables + * runtime survivability mode which notifies userspace that a firmware update + * is required. 
+ */ + if (csc_error->hec_fw_error) { + xe_err(xe, "[RAS]: CSC %s detected: 0x%x\n", + sev_to_str(counter->common.severity), + csc_error->hec_fw_error); + xe_survivability_mode_runtime_enable(xe); + return XE_RAS_RECOVERY_ACTION_DISCONNECT; + } + } else if (source->ieh) { + struct xe_ras_ieh_error *ieh_error = (void *)info->details; + + if (ieh_error->global_error_status & XE_RAS_SOC_IEH_PUNIT) { + xe_err(xe, "[RAS]: PUNIT %s detected: 0x%x\n", + sev_to_str(counter->common.severity), + ieh_error->global_error_status); + /* TODO: Add PUNIT error handling */ + return XE_RAS_RECOVERY_ACTION_DISCONNECT; + } + } + + /* For other SOC internal errors, request a reset as recovery mechanism */ + return XE_RAS_RECOVERY_ACTION_RESET; +} + +static enum xe_ras_recovery_action handle_device_memory_errors(struct xe_device *xe, + struct xe_ras_error_array *arr) +{ + struct xe_ras_memory_error *info = (void *)arr->details; + int ret; + + /* + * For memory errors, the recovery action depends on the error category + * + * Double bit ECC: Page offline handling + * Poison and data parity errors: Log only + * For any other memory errors, request a reset as recovery mechanism + */ + switch (info->category) { + case XE_RAS_MEMORY_ECC: + xe_err(xe, "[RAS]: Double-bit ECC error detected at sw address 0x%llx\n", + info->sw_address); + ret = handle_page_offline(xe, info->sw_address, true); + if (ret) + return XE_RAS_RECOVERY_ACTION_RESET; + break; + case XE_RAS_MEMORY_POISON: + xe_info(xe, "[RAS]: Poison error detected\n"); + break; + case XE_RAS_MEMORY_DATA_PARITY: + xe_info(xe, "[RAS]: Data parity error detected\n"); + break; + default: + return XE_RAS_RECOVERY_ACTION_RESET; + } + + return XE_RAS_RECOVERY_ACTION_RECOVERED; +} + +static void get_queued_pages(struct xe_device *xe) +{ + struct xe_sysctrl_mailbox_command command = {0}; + struct xe_ras_page_offline_queue response = {0}; + u32 count = 0; + size_t rlen; + int ret, i; + + /* Supported only on platforms with system controller */ + 
if (!xe->info.has_sysctrl) + return; + + xe_sysctrl_create_command(&command, XE_SYSCTRL_GROUP_GFSP, + XE_SYSCTRL_CMD_GET_OFFLINE_QUEUE, NULL, 0, &response, + sizeof(response)); + + do { + memset(&response, 0, sizeof(response)); + + ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen); + if (ret) { + xe_err(xe, "sysctrl: failed to get page offline queue %d\n", ret); + return; + } + + if (rlen != sizeof(response)) { + xe_err(xe, "sysctrl: unexpected page offline queue response length %zu (expected %zu)\n", + rlen, sizeof(response)); + return; + } + + for (i = 0; i < response.pages_returned && i < XE_RAS_NUM_PAGES; i++) + handle_page_offline(xe, response.page_addresses[i], true); + + count += response.pages_returned; + if (!response.pages_returned) + break; + + if (count > response.total_pages) { + xe_err(xe, "sysctrl: Pages returned from queue exceed total pages %u, returned %u\n", + response.total_pages, count); + return; + } + } while (response.additional_data); +} + +static void get_offlined_list(struct xe_device *xe) +{ + struct xe_sysctrl_mailbox_command command = {0}; + struct xe_ras_offline_list_response response = {0}; + struct xe_ras_offline_list_request request = {0}; + u32 count = 0; + size_t rlen; + int ret, i; + + /* Supported only on platforms with system controller */ + if (!xe->info.has_sysctrl) + return; + + xe_sysctrl_create_command(&command, XE_SYSCTRL_GROUP_GFSP, XE_SYSCTRL_CMD_GET_OFFLINE_LIST, + &request, sizeof(request), &response, sizeof(response)); + + do { + memset(&response, 0, sizeof(response)); + + request.index = count; + + ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen); + if (ret) { + xe_err(xe, "sysctrl: failed to get page offline list %d\n", ret); + return; + } + + if (rlen != sizeof(response)) { + xe_err(xe, "sysctrl: unexpected page offline list response length %zu (expected %zu)\n", + rlen, sizeof(response)); + return; + } + + for (i = 0; i < response.pages_returned && i < XE_RAS_NUM_PAGES; i++) + 
handle_page_offline(xe, response.page_addresses[i], false); + + count += response.pages_returned; + if (!response.pages_returned) + break; + + if (count > response.total_pages) { + xe_err(xe, "sysctrl: Pages returned from list exceed total pages %u, returned %u\n", + response.total_pages, count); + return; + } + } while (response.additional_data); +} + void xe_ras_counter_threshold_crossed(struct xe_device *xe, struct xe_sysctrl_event_response *response) { @@ -270,16 +564,173 @@ int xe_ras_clear_counter(struct xe_device *xe, u8 severity, u8 component) return 0; } +static void aer_unmask_and_downgrade_internal_error(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + u32 aer_uncorr_mask, aer_uncorr_sev, aer_uncorr_status; + struct pci_dev *usp; + u16 aer_cap; + + usp = find_usp_dev(pdev); + if (!usp) + return; + + aer_cap = pci_find_ext_capability(usp, PCI_EXT_CAP_ID_ERR); + if (!aer_cap) { + dev_info(&usp->dev, "AER capability unavailable\n"); + return; + } + + /* + * Clear any stale Uncorrectable Internal Error Status event in Uncorrectable Error + * Status Register. + */ + pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_STATUS, &aer_uncorr_status); + if (aer_uncorr_status & PCI_ERR_UNC_INTN) + pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_STATUS, PCI_ERR_UNC_INTN); + + /* + * All errors are steered to USP which is a PCIe AER Compliant device. + * Downgrade all the errors to non-fatal to prevent PCIe bus driver + * from triggering a Secondary Bus Reset (SBR). This allows error + * detection, containment and recovery in the driver. + * + * The Uncorrectable Error Severity Register has the 'Uncorrectable + * Internal Error Severity' set to fatal by default. Set this to + * non-fatal and unmask the error. 
+ */ + + /* Initialize Uncorrectable Error Severity Register */ + pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, &aer_uncorr_sev); + aer_uncorr_sev &= ~PCI_ERR_UNC_INTN; + pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, aer_uncorr_sev); + + /* Initialize Uncorrectable Error Mask Register */ + pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, &aer_uncorr_mask); + aer_uncorr_mask &= ~PCI_ERR_UNC_INTN; + pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, aer_uncorr_mask); + + pci_save_state(usp); + dev_dbg(&usp->dev, "Uncorrectable Internal Errors downgraded and unmasked\n"); +} + /** * xe_ras_init - Initialize Xe RAS * @xe: xe device instance * - * Register drm_ras nodes + * Initialize Xe RAS */ void xe_ras_init(struct xe_device *xe) { - if (!xe->info.has_drm_ras) - return; + if (xe->info.has_sysctrl) { + if (IS_ENABLED(CONFIG_PCIEAER)) + aer_unmask_and_downgrade_internal_error(xe); + + get_queued_pages(xe); + get_offlined_list(xe); + + /* + * During probe, process and log any errors detected by firmware + * while the driver was not loaded. Critical errors such as Punit + * and CSC are reported through Pcode init failure, causing the + * driver to enter survivability mode. + */ + xe_ras_process_errors(xe); + } + + if (xe->info.has_drm_ras) + xe_drm_ras_init(xe); +} + +/** + * xe_ras_process_errors() - Process and contain hardware errors + * @xe: xe device instance + * + * Get error details from system controller and return recovery + * method. Called from PCI error handling and from xe_ras_init(). 
+ * + * Returns: recovery action to be taken + */ +enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe) +{ + struct xe_sysctrl_mailbox_command command = {0}; + struct xe_ras_get_soc_error response; + enum xe_ras_recovery_action final_action; + u32 remaining = XE_SYSCTRL_FLOOD_LIMIT; + size_t rlen; + int ret; - xe_drm_ras_init(xe); + if (!xe->info.has_sysctrl) + return XE_RAS_RECOVERY_ACTION_RESET; + + /* Default action */ + final_action = XE_RAS_RECOVERY_ACTION_RECOVERED; + + xe_sysctrl_create_command(&command, XE_SYSCTRL_GROUP_GFSP, XE_SYSCTRL_CMD_GET_SOC_ERROR, + NULL, 0, &response, sizeof(response)); + + do { + memset(&response, 0, sizeof(response)); + + ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen); + if (ret) { + xe_err(xe, "sysctrl: failed to get soc error %d\n", ret); + goto err; + } + + if (rlen != sizeof(response)) { + xe_err(xe, "sysctrl: unexpected get soc error response length %zu (expected %zu)\n", + rlen, sizeof(response)); + goto err; + } + + /* Report if number of errors exceeds the maximum errors supported */ + if (response.num_errors > XE_RAS_NUM_ERROR_ARR) + xe_err(xe, "sysctrl: number of errors received %d out of bound (%d)\n", + response.num_errors, XE_RAS_NUM_ERROR_ARR); + + for (int i = 0; i < response.num_errors && i < XE_RAS_NUM_ERROR_ARR; i++) { + struct xe_ras_error_array *arr = &response.arr[i]; + enum xe_ras_recovery_action action; + u8 component, severity; + + component = arr->counter.common.component; + severity = arr->counter.common.severity; + + xe_err(xe, "[RAS]: %s %s detected\n", comp_to_str(component), + sev_to_str(severity)); + + switch (component) { + case XE_RAS_COMP_CORE_COMPUTE: + action = handle_core_compute_errors(arr); + break; + case XE_RAS_COMP_SOC_INTERNAL: + action = handle_soc_internal_errors(xe, arr); + break; + case XE_RAS_COMP_DEVICE_MEMORY: + action = handle_device_memory_errors(xe, arr); + break; + default: + /* For any other component, reset */ + action = 
XE_RAS_RECOVERY_ACTION_RESET; + break; + } + + /* Process and log all errors and then trigger highest recovery action */ + if (action > final_action) + final_action = action; + } + + /* Treat flooding as a system controller error */ + if (!--remaining) { + xe_err(xe, "[RAS]: sysctrl: get soc error response flooding\n"); + return XE_RAS_RECOVERY_ACTION_RESET; + } + + } while (response.additional_errors); + + return final_action; + +err: + return XE_RAS_RECOVERY_ACTION_RESET; } diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h index ba0b0224df23..b85657b1dd97 100644 --- a/drivers/gpu/drm/xe/xe_ras.h +++ b/drivers/gpu/drm/xe/xe_ras.h @@ -6,7 +6,7 @@ #ifndef _XE_RAS_H_ #define _XE_RAS_H_ -#include <linux/types.h> +#include "xe_ras_types.h" struct xe_device; struct xe_sysctrl_event_response; @@ -16,5 +16,6 @@ void xe_ras_counter_threshold_crossed(struct xe_device *xe, int xe_ras_get_counter(struct xe_device *xe, u8 severity, u8 component, u32 *value); int xe_ras_clear_counter(struct xe_device *xe, u8 severity, u8 component); void xe_ras_init(struct xe_device *xe); +enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe); #endif diff --git a/drivers/gpu/drm/xe/xe_ras_types.h b/drivers/gpu/drm/xe/xe_ras_types.h index 6688e11f57a8..a041a8375730 100644 --- a/drivers/gpu/drm/xe/xe_ras_types.h +++ b/drivers/gpu/drm/xe/xe_ras_types.h @@ -8,7 +8,67 @@ #include <linux/types.h> +#define XE_RAS_NUM_ERROR_ARR 3 #define XE_RAS_NUM_COUNTERS 16 +#define XE_RAS_SOC_IEH_PUNIT BIT(1) +/* Device memory error categories */ +#define XE_RAS_MEMORY_ECC BIT(1) +#define XE_RAS_MEMORY_POISON BIT(2) +#define XE_RAS_MEMORY_DATA_PARITY BIT(5) + +#define XE_RAS_NUM_PAGES 25 + +/** + * enum xe_ras_recovery_action - RAS recovery actions + * + * @XE_RAS_RECOVERY_ACTION_RECOVERED: Error recovered + * @XE_RAS_RECOVERY_ACTION_RESET: Requires reset + * @XE_RAS_RECOVERY_ACTION_DISCONNECT: Requires disconnect + * @XE_RAS_RECOVERY_ACTION_MAX: Max action value + * + * This 
enum defines the possible recovery actions that can be taken in response + * to RAS errors. + */ +enum xe_ras_recovery_action { + XE_RAS_RECOVERY_ACTION_RECOVERED = 0, + XE_RAS_RECOVERY_ACTION_RESET, + XE_RAS_RECOVERY_ACTION_DISCONNECT, + XE_RAS_RECOVERY_ACTION_MAX +}; + +/** + * enum xe_ras_page_action - Page offline actions for page offline request + * + * @XE_RAS_PAGE_ACTION_OFFLINE: Instruct firmware to remove page from queue + * @XE_RAS_PAGE_ACTION_DECLINE: Instruct firmware to mark page as not offline + * @XE_RAS_PAGE_ACTION_MAX: Max value for validation + */ +enum xe_ras_page_action { + XE_RAS_PAGE_ACTION_OFFLINE, + XE_RAS_PAGE_ACTION_DECLINE, + XE_RAS_PAGE_ACTION_MAX +}; + +/** + * enum xe_ras_response_status - RAS response status codes + * + * @XE_RAS_STATUS_SUCCESS: Operation successful + * @XE_RAS_STATUS_INVALID_PARAM: Invalid parameter + * @XE_RAS_STATUS_OP_NOT_SUPPORTED: Operation not supported + * @XE_RAS_STATUS_TIMEOUT: Operation timed out + * @XE_RAS_STATUS_HARDWARE_FAILURE: Hardware failure + * @XE_RAS_STATUS_INSUFFICIENT_RESOURCES: Insufficient resources + * @XE_RAS_STATUS_UNKNOWN_ERROR: Unknown error + */ +enum xe_ras_response_status { + XE_RAS_STATUS_SUCCESS = 0, + XE_RAS_STATUS_INVALID_PARAM, + XE_RAS_STATUS_OP_NOT_SUPPORTED, + XE_RAS_STATUS_TIMEOUT, + XE_RAS_STATUS_HARDWARE_FAILURE, + XE_RAS_STATUS_INSUFFICIENT_RESOURCES, + XE_RAS_STATUS_UNKNOWN_ERROR +}; /** * struct xe_ras_error_common - Error fields that are common across all products @@ -121,4 +181,173 @@ struct xe_ras_clear_counter_response { /** @reserved1: Reserved for future use */ u32 reserved1[3]; } __packed; -#endif + +/** + * struct xe_ras_error_array - Details of the error types + */ +struct xe_ras_error_array { + /** @counter_value: Counter value of the returned error */ + u32 counter_value; + /** @counter: Error counter */ + struct xe_ras_error_class counter; + /** @timestamp: Timestamp */ + u64 timestamp; + /** @details: Error details specific to the counter */ + u32 
details[XE_RAS_NUM_COUNTERS]; +} __packed; + +/** + * struct xe_ras_get_soc_error - Response from get soc error command + */ +struct xe_ras_get_soc_error { + /** @num_errors: Number of errors reported in this response */ + u8 num_errors; + /** @additional_errors: Indicates if the errors are pending */ + u8 additional_errors; + /** @arr: Array of up to 3 errors */ + struct xe_ras_error_array arr[XE_RAS_NUM_ERROR_ARR]; +} __packed; + +/** + * struct xe_ras_compute_error - Error details of Core Compute error + */ +struct xe_ras_compute_error { + /** @log_header: Error Source and type */ + u32 log_header; + /** @reserved: Reserved */ + u32 reserved[15]; +} __packed; + +/** + * struct xe_ras_soc_error_source - Source of SoC error + */ +struct xe_ras_soc_error_source { + /** @csc: CSC */ + u32 csc:1; + /** @ieh: IEH (Integrated Error Handler) */ + u32 ieh:1; + /** @reserved: Reserved for future use */ + u32 reserved:30; +} __packed; + +/** + * struct xe_ras_soc_error - Error details of SoC internal error + */ +struct xe_ras_soc_error { + /** @source: Error source */ + struct xe_ras_soc_error_source source; + /** @details: Error details specific to the error source */ + u32 details[15]; +} __packed; + +/** + * struct xe_ras_csc_error - CSC error details + */ +struct xe_ras_csc_error { + /** @reserved: Reserved */ + u32 reserved; + /** @hec_fw_error: CSC firmware error */ + u32 hec_fw_error; +} __packed; + +/** + * struct xe_ras_ieh_error - SoC IEH (Integrated Error Handler) error details + */ +struct xe_ras_ieh_error { + /** @ieh_instance: IEH instance */ + u32 ieh_instance:2; + /** @reserved: Reserved for future use */ + u32 reserved:30; + /** @global_error_status: Global error status */ + u32 global_error_status; + /** @local_error_status: Local error status */ + u32 local_error_status; + /** @gerr_mask: Global error mask */ + u32 gerr_mask; + /** @info: Additional information */ + u32 info[10]; +} __packed; + +/** + * struct xe_ras_memory_error - Device memory error 
details + */ +struct xe_ras_memory_error { + /** @category: Device memory error category */ + u8 category; + /** @reserved: Reserved for future use */ + u8 reserved[7]; + /** @hardware_address: Hardware physical address details */ + u64 hardware_address; + /** @sw_address: Software address where error occurred */ + u64 sw_address; + /** @reserved1: Reserved */ + u32 reserved1[10]; +} __packed; + +/** + * struct xe_ras_offline_list_request - Request for get offline list command + */ +struct xe_ras_offline_list_request { + /** @index: Zero-based index into the offline page list */ + u32 index; +} __packed; + +/** + * struct xe_ras_offline_list_response - Response from get offline list command + */ +struct xe_ras_offline_list_response { + /** @max_entries: Total number of pages that can be stored in flash */ + u32 max_entries; + /** @total_pages: Total number of permanently offlined pages */ + u32 total_pages; + /** @pages_returned: Number of pages returned in this response */ + u32 pages_returned; + /** @page_addresses: Array of permanently offlined page addresses (4KB aligned) */ + u64 page_addresses[XE_RAS_NUM_PAGES]; + /** @additional_data: Indicates if more data is available */ + u8 additional_data; + /** @reserved: Reserved for future use */ + u8 reserved[3]; +} __packed; + +/** + * struct xe_ras_page_offline_queue - Response from get offline queue command + */ +struct xe_ras_page_offline_queue { + /** @total_pages: Total number of queued pages */ + u32 total_pages; + /** @pages_returned: Number of pages returned in this response */ + u32 pages_returned; + /** @page_addresses: Array of page addresses (4KB aligned) */ + u64 page_addresses[XE_RAS_NUM_PAGES]; + /** @additional_data: Indicates if more data is available */ + u8 additional_data; + /** @reserved: Reserved for future use */ + u8 reserved[3]; +} __packed; + +/** + * struct xe_ras_page_offline_request - Request for page offline command + * + * This structure provides the request format to offline/decline a 
page + */ +struct xe_ras_page_offline_request { + /** @page_address: Page address (4KB aligned) */ + u64 page_address; + /** @action: Action to be performed, see &enum xe_ras_page_action */ + u32 action; + /** @reserved: Reserved for future use */ + u32 reserved; +} __packed; + +/** + * struct xe_ras_page_offline_response - Response from page offline command + */ +struct xe_ras_page_offline_response { + /** @status: Status of the page offline request, see &enum xe_ras_response_status */ + u32 status; + /** @reserved: Reserved for future use */ + u32 reserved; +} __packed; + +#endif /* _XE_RAS_TYPES_H_ */ diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 427afd144f3a..4c506027fa94 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -54,7 +54,6 @@ * # cat /sys/bus/pci/devices/<device>/survivability_mode * Boot * - * * Any additional debug information if present will be visible under the directory * ``survivability_info``:: * @@ -98,6 +97,15 @@ * # cat /sys/bus/pci/devices/<device>/survivability_mode * Runtime * + * On some CSC firmware errors, PCODE sets FDO mode and the only recovery possible is through + * firmware flash using SPI driver. Userspace can check if FDO mode is set by checking the below + * sysfs entry. + * + * .. code-block:: shell + * + * # cat /sys/bus/pci/devices/<device>/survivability_info/fdo_mode + * enabled + * * When such errors occur, userspace is notified with the drm device wedged uevent and runtime * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd * to restore device to normal operation. 
@@ -296,7 +304,8 @@ static int create_survivability_sysfs(struct pci_dev *pdev) if (ret) return ret; - if (check_boot_failure(xe)) { + /* Survivability info is not required if enabled via configfs */ + if (!xe_configfs_get_survivability_mode(pdev)) { ret = devm_device_add_group(dev, &survivability_info_group); if (ret) return ret; diff --git a/drivers/gpu/drm/xe/xe_sysctrl_event.c b/drivers/gpu/drm/xe/xe_sysctrl_event.c index b4d17329af6c..da395148ee9d 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_event.c +++ b/drivers/gpu/drm/xe/xe_sysctrl_event.c @@ -16,7 +16,7 @@ static void get_pending_event(struct xe_sysctrl *sc, struct xe_sysctrl_mailbox_c { struct xe_sysctrl_event_response *response = command->data_out; struct xe_device *xe = sc_to_xe(sc); - u32 count = XE_SYSCTRL_EVENT_FLOOD; + u32 count = XE_SYSCTRL_FLOOD_LIMIT; size_t len; int ret; diff --git a/drivers/gpu/drm/xe/xe_sysctrl_event_types.h b/drivers/gpu/drm/xe/xe_sysctrl_event_types.h index c16c66b9fa7f..348768ca454a 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_event_types.h +++ b/drivers/gpu/drm/xe/xe_sysctrl_event_types.h @@ -10,9 +10,6 @@ #define XE_SYSCTRL_EVENT_DATA_LEN 59 -/* Modify as needed */ -#define XE_SYSCTRL_EVENT_FLOOD 16 - /** * enum xe_sysctrl_event - Events reported by System Controller * diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h index fb434cc165b2..d34e9a9dbb0e 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h +++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h @@ -30,5 +30,4 @@ void xe_sysctrl_mailbox_init(struct xe_sysctrl *sc); int xe_sysctrl_send_command(struct xe_sysctrl *sc, struct xe_sysctrl_mailbox_command *cmd, size_t *rdata_len); - #endif diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h index 6e3753554510..5afd1ed16db2 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h +++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h @@ -22,14 +22,22 @@ enum xe_sysctrl_group { /** * 
enum xe_sysctrl_gfsp_cmd - Commands supported by GFSP group * + * @XE_SYSCTRL_CMD_GET_SOC_ERROR: Retrieve basic error information * @XE_SYSCTRL_CMD_GET_COUNTER: Get error counter value * @XE_SYSCTRL_CMD_CLEAR_COUNTER: Clear error counter value * @XE_SYSCTRL_CMD_GET_PENDING_EVENT: Retrieve pending event + * @XE_SYSCTRL_CMD_PAGE_OFFLINE: Instruct firmware to offline/decline a page + * @XE_SYSCTRL_CMD_GET_OFFLINE_LIST: Retrieve list of all offlined pages from flash + * @XE_SYSCTRL_CMD_GET_OFFLINE_QUEUE: Retrieve list of offlined queued pages from firmware */ enum xe_sysctrl_gfsp_cmd { + XE_SYSCTRL_CMD_GET_SOC_ERROR = 0x01, XE_SYSCTRL_CMD_GET_COUNTER = 0x03, XE_SYSCTRL_CMD_CLEAR_COUNTER = 0x04, XE_SYSCTRL_CMD_GET_PENDING_EVENT = 0x07, + XE_SYSCTRL_CMD_PAGE_OFFLINE = 0x08, + XE_SYSCTRL_CMD_GET_OFFLINE_LIST = 0x09, + XE_SYSCTRL_CMD_GET_OFFLINE_QUEUE = 0x0A, }; /** @@ -52,6 +60,9 @@ struct xe_sysctrl_mailbox_command { size_t data_out_len; }; +/* Modify as needed */ +#define XE_SYSCTRL_FLOOD_LIMIT 16 + #define XE_SYSCTRL_MB_FRAME_SIZE 16 #define XE_SYSCTRL_MB_MAX_FRAMES 64 #define XE_SYSCTRL_MB_MAX_MESSAGE_SIZE \ -- 2.34.1
