From: Riana Tauro <[email protected]> DO NOT REVIEW. COMPILATION ONLY. This patch is from https://patchwork.freedesktop.org/series/160482/. It is added only for compilation.
Signed-off-by: Riana Tauro <[email protected]> Signed-off-by: Mallesh Koujalagi <[email protected]> --- drivers/gpu/drm/xe/Makefile | 1 + drivers/gpu/drm/xe/xe_device.c | 24 +- drivers/gpu/drm/xe/xe_device.h | 27 +- drivers/gpu/drm/xe/xe_device_types.h | 12 +- drivers/gpu/drm/xe/xe_gt.c | 14 +- drivers/gpu/drm/xe/xe_guc_submit.c | 9 +- drivers/gpu/drm/xe/xe_pci.c | 9 + drivers/gpu/drm/xe/xe_pci_error.c | 135 +++++ drivers/gpu/drm/xe/xe_pci_error.h | 13 + drivers/gpu/drm/xe/xe_ras.c | 475 ++++++++++++++++++ drivers/gpu/drm/xe/xe_ras.h | 4 + drivers/gpu/drm/xe/xe_ras_types.h | 215 ++++++++ drivers/gpu/drm/xe/xe_survivability_mode.c | 13 +- drivers/gpu/drm/xe/xe_sysctrl_event.c | 2 +- drivers/gpu/drm/xe/xe_sysctrl_event_types.h | 3 - drivers/gpu/drm/xe/xe_sysctrl_mailbox.c | 28 ++ drivers/gpu/drm/xe/xe_sysctrl_mailbox.h | 4 +- drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h | 11 + 18 files changed, 971 insertions(+), 28 deletions(-) create mode 100644 drivers/gpu/drm/xe/xe_pci_error.c create mode 100644 drivers/gpu/drm/xe/xe_pci_error.h diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 09661f079d03..091872771e98 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -101,6 +101,7 @@ xe-y += xe_bb.o \ xe_page_reclaim.o \ xe_pat.o \ xe_pci.o \ + xe_pci_error.o \ xe_pci_rebar.o \ xe_pcode.o \ xe_pm.o \ diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 51e3a2dd7b22..7ee2148f1321 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -61,6 +61,7 @@ #include "xe_psmi.h" #include "xe_pxp.h" #include "xe_query.h" +#include "xe_ras.h" #include "xe_shrinker.h" #include "xe_soc_remapper.h" #include "xe_survivability_mode.h" @@ -915,7 +916,7 @@ static void xe_device_wedged_fini(struct drm_device *drm, void *arg) { struct xe_device *xe = arg; - if (atomic_read(&xe->wedged.flag)) + if (atomic_read(&xe->wedged.fini)) xe_pm_runtime_put(xe); } @@ -988,6 +989,16 @@ int 
xe_device_probe(struct xe_device *xe) if (err) return err; + err = xe_soc_remapper_init(xe); + if (err) + return err; + + err = xe_sysctrl_init(xe); + if (err) + return err; + + xe_ras_init(xe); + /* * Now that GT is initialized (TTM in particular), * we can try to init display, and inherit the initial fb. @@ -1028,10 +1039,6 @@ int xe_device_probe(struct xe_device *xe) xe_nvm_init(xe); - err = xe_soc_remapper_init(xe); - if (err) - return err; - err = xe_heci_gsc_init(xe); if (err) return err; @@ -1070,10 +1077,6 @@ int xe_device_probe(struct xe_device *xe) if (err) goto err_unregister_display; - err = xe_sysctrl_init(xe); - if (err) - goto err_unregister_display; - err = xe_device_sysfs_init(xe); if (err) goto err_unregister_display; @@ -1411,7 +1414,8 @@ void xe_device_declare_wedged(struct xe_device *xe) return; } - if (!atomic_xchg(&xe->wedged.flag, 1)) { + if (!atomic_xchg(&xe->wedged.fini, 1)) { + xe_device_wedged_get(xe); xe->needs_flr_on_fini = true; xe_pm_runtime_get_noresume(xe); drm_err(&xe->drm, diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index 975768a6a9c8..e177c05a7a95 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -181,6 +181,21 @@ static inline bool xe_device_has_mert(const struct xe_device *xe) return xe->info.has_mert; } +static inline bool xe_device_is_in_reset(struct xe_device *xe) +{ + return atomic_read(&xe->in_reset); +} + +static inline void xe_device_set_in_reset(struct xe_device *xe) +{ + atomic_set(&xe->in_reset, 1); +} + +static inline void xe_device_clear_in_reset(struct xe_device *xe) +{ + atomic_set(&xe->in_reset, 0); +} + u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size); void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p); @@ -192,9 +207,19 @@ bool xe_device_is_l2_flush_optimized(struct xe_device *xe); void xe_device_td_flush(struct xe_device *xe); void xe_device_l2_flush(struct xe_device *xe); +static inline void 
xe_device_wedged_get(struct xe_device *xe) +{ + atomic_inc(&xe->wedged.ref); +} + +static inline void xe_device_wedged_put(struct xe_device *xe) +{ + atomic_dec(&xe->wedged.ref); +} + static inline bool xe_device_wedged(struct xe_device *xe) { - return atomic_read(&xe->wedged.flag); + return atomic_read(&xe->wedged.ref); } void xe_device_set_wedged_method(struct xe_device *xe, unsigned long method); diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 32dd2ffbc796..bf43a3277d1e 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -483,10 +483,15 @@ struct xe_device { /** @needs_flr_on_fini: requests function-reset on fini */ bool needs_flr_on_fini; + /** @in_reset: Indicates if device is in reset */ + atomic_t in_reset; + /** @wedged: Struct to control Wedged States and mode */ struct { - /** @wedged.flag: Xe device faced a critical error and is now blocked. */ - atomic_t flag; + /** @wedged.fini: Needs cleanup on fini */ + atomic_t fini; + /** @wedged.ref: Refcount for wedged device, blocks critical path execution */ + atomic_t ref; /** @wedged.mode: Mode controlled by kernel parameter and debugfs */ enum xe_wedged_mode mode; /** @wedged.method: Recovery method to be sent in the drm device wedged uevent */ @@ -495,6 +500,9 @@ struct xe_device { bool inconsistent_reset; } wedged; + /** @devres_group_id: id for devres group */ + void *devres_group_id; + /** @bo_device: Struct to control async free of BOs */ struct xe_bo_dev { /** @bo_device.async_free: Free worker */ diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 783eb6d631b5..d904527a8898 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -917,6 +917,9 @@ static void gt_reset_worker(struct work_struct *w) if (xe_device_wedged(gt_to_xe(gt))) goto err_pm_put; + if (xe_device_is_in_reset(gt_to_xe(gt))) + goto err_pm_put; + /* We only support GT resets with GuC submission */ if 
(!xe_device_uc_enabled(gt_to_xe(gt))) goto err_pm_put; @@ -977,18 +980,21 @@ static void gt_reset_worker(struct work_struct *w) void xe_gt_reset_async(struct xe_gt *gt) { - xe_gt_info(gt, "trying reset from %ps\n", __builtin_return_address(0)); + struct xe_device *xe = gt_to_xe(gt); + + if (xe_device_is_in_reset(xe)) + return; /* Don't do a reset while one is already in flight */ if (!xe_fault_inject_gt_reset() && xe_uc_reset_prepare(&gt->uc)) return; - xe_gt_info(gt, "reset queued\n"); + xe_gt_info(gt, "reset queued from %ps\n", __builtin_return_address(0)); /* Pair with put in gt_reset_worker() if work is enqueued */ - xe_pm_runtime_get_noresume(gt_to_xe(gt)); + xe_pm_runtime_get_noresume(xe); if (!queue_work(gt->ordered_wq, &gt->reset.worker)) - xe_pm_runtime_put(gt_to_xe(gt)); + xe_pm_runtime_put(xe); } void xe_gt_suspend_prepare(struct xe_gt *gt) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 4b247a3019d2..b2870d7ab8ce 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1511,7 +1511,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) * If devcoredump not captured and GuC capture for the job is not ready * do manual capture first and decide later if we need to use it */ - if (!exec_queue_killed(q) && !xe->devcoredump.captured && + if (!xe_device_is_in_reset(xe) && !exec_queue_killed(q) && !xe->devcoredump.captured && !xe_guc_capture_get_matching_and_lock(q)) { /* take force wake before engine register manual capture */ CLASS(xe_force_wake, fw_ref)(gt_to_fw(q->gt), XE_FORCEWAKE_ALL); @@ -1533,8 +1533,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) set_exec_queue_banned(q); /* Kick job / queue off hardware */ - if (!wedged && (exec_queue_enabled(primary) || - exec_queue_pending_disable(primary))) { + if (!xe_device_is_in_reset(xe) && !wedged && + (exec_queue_enabled(primary) || exec_queue_pending_disable(primary))) { int ret; if (exec_queue_reset(primary))
@@ -1602,7 +1602,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) trace_xe_sched_job_timedout(job); - if (!exec_queue_killed(q)) + /* Do not access device if in reset */ + if (!xe_device_is_in_reset(xe) && !exec_queue_killed(q)) xe_devcoredump(q, job, "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx", xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job), diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 78fc2e4dcfc6..ab74a5852dbd 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -26,6 +26,7 @@ #include "xe_guc.h" #include "xe_mmio.h" #include "xe_module.h" +#include "xe_pci_error.h" #include "xe_pci_rebar.h" #include "xe_pci_sriov.h" #include "xe_pci_types.h" @@ -1076,6 +1077,7 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) const struct xe_device_desc *desc = (const void *)ent->driver_data; const struct xe_subplatform_desc *subplatform_desc; struct xe_device *xe; + void *devres_id; int err; subplatform_desc = find_subplatform(desc, pdev->device); @@ -1103,6 +1105,10 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (xe_display_driver_probe_defer(pdev)) return -EPROBE_DEFER; + devres_id = devres_open_group(&pdev->dev, NULL, GFP_KERNEL); + if (!devres_id) + return -ENOMEM; + err = pcim_enable_device(pdev); if (err) return err; @@ -1111,6 +1117,8 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (IS_ERR(xe)) return PTR_ERR(xe); + xe->devres_group_id = devres_id; + pci_set_drvdata(pdev, &xe->drm); xe_pm_assert_unbounded_bridge(xe); @@ -1349,6 +1357,7 @@ static struct pci_driver xe_pci_driver = { .remove = xe_pci_remove, .shutdown = xe_pci_shutdown, .sriov_configure = xe_pci_sriov_configure, + .err_handler = &xe_pci_error_handlers, #ifdef CONFIG_PM_SLEEP .driver.pm = &xe_pm_ops, #endif diff --git a/drivers/gpu/drm/xe/xe_pci_error.c b/drivers/gpu/drm/xe/xe_pci_error.c new file mode 
100644
index 000000000000..b08601f470d6
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_pci_error.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#include <linux/pci.h>
+
+#include "xe_device.h"
+#include "xe_gt.h"
+#include "xe_pci.h"
+#include "xe_printk.h"
+#include "xe_ras.h"
+#include "xe_survivability_mode.h"
+
+/*
+ * Quiesce the device before a PCI reset: mark it as in reset, take a wedged
+ * reference, declare every GT wedged and disable the PCI device.
+ */
+static void prepare_device_for_reset(struct pci_dev *pdev)
+{
+	struct xe_device *xe = pdev_to_xe_device(pdev);
+	struct xe_gt *gt;
+	u8 id;
+
+	xe_device_set_in_reset(xe);
+
+	/* Wedge the device to prevent userspace access during reset */
+	xe_device_wedged_get(xe);
+
+	for_each_gt(gt, xe, id)
+		xe_gt_declare_wedged(gt);
+
+	pci_disable_device(pdev);
+}
+
+/*
+ * Map a RAS recovery action onto the PCI error recovery result reported to
+ * the PCI core. Unknown actions are treated as a disconnect.
+ */
+static pci_ers_result_t ras_action_to_pci_result(struct pci_dev *pdev, u32 action)
+{
+	switch (action) {
+	case XE_RAS_RECOVERY_ACTION_RECOVERED:
+		/*
+		 * xe_pci_error_resume() drops a wedged reference unconditionally.
+		 * Take one here so that recovery without a reset stays balanced
+		 * instead of driving the refcount negative.
+		 */
+		xe_device_wedged_get(pdev_to_xe_device(pdev));
+		return PCI_ERS_RESULT_RECOVERED;
+	case XE_RAS_RECOVERY_ACTION_RESET:
+		prepare_device_for_reset(pdev);
+		return PCI_ERS_RESULT_NEED_RESET;
+	case XE_RAS_RECOVERY_ACTION_DISCONNECT:
+	default:
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+}
+
+/*
+ * .error_detected callback: decide whether recovery is attempted for the
+ * reported channel state.
+ */
+static pci_ers_result_t xe_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	struct xe_device *xe = pdev_to_xe_device(pdev);
+
+	xe_err(xe, "PCI error: detected state = %u\n", state);
+
+	if (state == pci_channel_io_perm_failure)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	/* If the device is already wedged or in survivability mode, do not attempt recovery */
+	if (xe_survivability_mode_is_boot_enabled(xe) || xe_device_wedged(xe))
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	switch (state) {
+	case pci_channel_io_normal:
+		return PCI_ERS_RESULT_CAN_RECOVER;
+	case pci_channel_io_frozen:
+		prepare_device_for_reset(pdev);
+		return PCI_ERS_RESULT_NEED_RESET;
+	default:
+		xe_err(xe, "PCI error: unknown state %d\n", state);
+		/* Quiesce as for a frozen channel before requesting a reset */
+		prepare_device_for_reset(pdev);
+		return PCI_ERS_RESULT_NEED_RESET;
+	}
+}
+
+/*
+ * .mmio_enabled callback: let RAS process the pending errors and translate the
+ * recovery action it returns.
+ */
+static pci_ers_result_t xe_pci_error_mmio_enabled(struct pci_dev *pdev)
+{
+	struct xe_device *xe = pdev_to_xe_device(pdev);
+	enum xe_ras_recovery_action action;
+
+	xe_err(xe, "PCI error: MMIO enabled\n");
+
+	action = xe_ras_process_errors(xe);
+
+	return ras_action_to_pci_result(pdev, action);
+}
+
+/*
+ * .slot_reset callback: restore PCI state, then unbind and re-probe the device
+ * to rebuild driver and hardware state.
+ */
+static pci_ers_result_t xe_pci_error_slot_reset(struct pci_dev *pdev)
+{
+	const struct pci_device_id *ent = pci_match_id(pdev->driver->id_table, pdev);
+	struct xe_device *xe = pdev_to_xe_device(pdev);
+	void *devres_group_id;
+
+	xe_err(xe, "PCI error: slot reset\n");
+
+	/* probe() below needs the matching ID entry; bail out if there is none */
+	if (!ent) {
+		xe_err(xe, "No matching PCI device ID, cannot re-probe after reset\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_restore_state(pdev);
+
+	if (pci_enable_device(pdev)) {
+		xe_err(xe, "Cannot re-enable PCI device after reset\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	/*
+	 * Secondary Bus Reset causes all VRAM state to be lost along with
+	 * hardware state. As an initial step, re-probe the device to
+	 * re-initialize the driver and hardware.
+	 * TODO: optimize by re-initializing only the hardware state and re-creating
+	 * kernel BOs.
+	 */
+	xe_device_clear_in_reset(xe);
+
+	/* Read the group id before remove() so xe is not dereferenced afterwards */
+	devres_group_id = xe->devres_group_id;
+	pdev->driver->remove(pdev);
+	devres_release_group(&pdev->dev, devres_group_id);
+
+	if (pdev->driver->probe(pdev, ent))
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	xe = pdev_to_xe_device(pdev);
+
+	/* Wedge the device to prevent I/O operations till the resume callback */
+	xe_device_wedged_get(xe);
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+/*
+ * .resume callback: drop the wedged reference taken during recovery.
+ */
+static void xe_pci_error_resume(struct pci_dev *pdev)
+{
+	struct xe_device *xe = pdev_to_xe_device(pdev);
+
+	xe_err(xe, "PCI error: resume\n");
+
+	/* Resume I/O operations */
+	xe_device_wedged_put(xe);
+}
+
+const struct pci_error_handlers xe_pci_error_handlers = {
+	.error_detected = xe_pci_error_detected,
+	.mmio_enabled = xe_pci_error_mmio_enabled,
+	.slot_reset = xe_pci_error_slot_reset,
+	.resume = xe_pci_error_resume,
+};
diff --git a/drivers/gpu/drm/xe/xe_pci_error.h b/drivers/gpu/drm/xe/xe_pci_error.h
new file mode 100644
index 000000000000..725ad0214e62
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_pci_error.h
@@ -0,0 +1,13 @@
+/*
SPDX-License-Identifier: MIT */ +/* + * Copyright © 2026 Intel Corporation + */ + +#ifndef _XE_PCI_ERROR_H_ +#define _XE_PCI_ERROR_H_ + +struct pci_error_handlers; + +extern const struct pci_error_handlers xe_pci_error_handlers; + +#endif diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c index 4cb16b419b0c..1f3c4a5f39d9 100644 --- a/drivers/gpu/drm/xe/xe_ras.c +++ b/drivers/gpu/drm/xe/xe_ras.c @@ -3,12 +3,19 @@ * Copyright © 2026 Intel Corporation */ +#include "xe_bo.h" #include "xe_device.h" #include "xe_printk.h" #include "xe_ras.h" #include "xe_ras_types.h" +#include "xe_survivability_mode.h" #include "xe_sysctrl.h" #include "xe_sysctrl_event_types.h" +#include "xe_sysctrl_mailbox.h" +#include "xe_sysctrl_mailbox_types.h" + +#define CORE_COMPUTE_UNCORR_TYPE GENMASK(26, 25) +#define GLOBAL_UNCORR_ERROR 2 /* Severity of detected errors */ enum xe_ras_severity { @@ -66,6 +73,290 @@ static inline const char *comp_to_str(u8 component) return xe_ras_components[component]; } +static int ras_status_to_errno(u32 status) +{ + switch (status) { + case XE_RAS_STATUS_SUCCESS: + return 0; + case XE_RAS_STATUS_INVALID_PARAM: + return -EINVAL; + case XE_RAS_STATUS_OP_NOT_SUPPORTED: + return -EOPNOTSUPP; + case XE_RAS_STATUS_TIMEOUT: + return -ETIMEDOUT; + case XE_RAS_STATUS_HARDWARE_FAILURE: + return -EIO; + case XE_RAS_STATUS_INSUFFICIENT_RESOURCES: + return -ENOSPC; + default: + return -EPROTO; + } +} + +static int send_page_offline(struct xe_device *xe, enum xe_ras_page_action action, u64 page_address) +{ + struct xe_sysctrl_mailbox_command command = {0}; + struct xe_ras_page_offline_request request = {0}; + struct xe_ras_page_offline_response response = {0}; + size_t rlen; + int ret; + + if (!xe->info.has_sysctrl) + return 0; + + if (action >= XE_RAS_PAGE_ACTION_MAX) { + xe_err(xe, "[RAS]: Invalid page offline action %d\n", action); + return -EINVAL; + } + + request.page_address = page_address; + request.action = action; + + 
xe_sysctrl_create_command(&command, XE_SYSCTRL_GROUP_GFSP, XE_SYSCTRL_CMD_PAGE_OFFLINE, + &request, sizeof(request), &response, sizeof(response)); + + ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen); + if (ret) { + xe_err(xe, "sysctrl: failed to send page offline command %d\n", ret); + return ret; + } + + if (rlen != sizeof(response)) { + xe_err(xe, "sysctrl: unexpected page offline response length %zu (expected %zu)\n", + rlen, sizeof(response)); + return -EINVAL; + } + + ret = ras_status_to_errno(response.status); + if (ret) { + xe_err(xe, "sysctrl: page offline command failed with status %d\n", + response.status); + } + + return ret; +} + +static int handle_page_offline(struct xe_device *xe, u64 page_address, bool send_offline_cmd) +{ + enum xe_ras_page_action action; + int ret = 0; + + if (!IS_ALIGNED(page_address, XE_PAGE_SIZE)) { + xe_err(xe, "sysctrl: Unaligned page address: 0x%llx\n", page_address); + return -EINVAL; + } + + /* + * TODO: Call function to handle address fault + * ret = xe_ttm_vram_handle_addr_fault(xe, page_address); + */ + + /* + * Handle return code from address fault handling function: + * 0: Address is valid and can be offlined + * -EIO: Address belongs to a critical BO that cannot be offlined + * -EOPNOTSUPP: Address is valid and can be offlined but user policy is not to offline + * + * For any other non-zero error code, skip offlining. 
+ */ + + switch (ret) { + case 0: + action = XE_RAS_PAGE_ACTION_OFFLINE; + break; + /* User policy set to decline page offlining */ + case -EOPNOTSUPP: + action = XE_RAS_PAGE_ACTION_DECLINE; + break; + case -EIO: + xe_err(xe, "[RAS]: Page address belongs to critical BO: 0x%llx\n", + page_address); + return ret; + default: + xe_err(xe, "[RAS]: Failed to handle address fault 0x%llx: %d\n", + page_address, ret); + return 0; + } + + if (send_offline_cmd) { + ret = send_page_offline(xe, action, page_address); + if (ret) + xe_err(xe, "sysctrl: Failed to offline page for address 0x%llx: %d\n", + page_address, ret); + return ret; + } + + return 0; +} + +static enum xe_ras_recovery_action handle_core_compute_errors(struct xe_ras_error_array *arr) +{ + struct xe_ras_compute_error *error_info = (void *)arr->details; + u8 uncorr_type; + + uncorr_type = FIELD_GET(CORE_COMPUTE_UNCORR_TYPE, error_info->log_header); + + /* Request a reset if error is global */ + if (uncorr_type == GLOBAL_UNCORR_ERROR) + return XE_RAS_RECOVERY_ACTION_RESET; + + /* + * No action needed for other errors. + * Local errors are recovered using an engine reset by GuC. + */ + return XE_RAS_RECOVERY_ACTION_RECOVERED; +} + +static enum xe_ras_recovery_action handle_soc_internal_errors(struct xe_device *xe, + struct xe_ras_error_array *arr) +{ + struct xe_ras_soc_error *info = (void *)arr->details; + struct xe_ras_soc_error_source *source = &info->source; + struct xe_ras_error_class *counter = &arr->counter; + + if (source->csc) { + struct xe_ras_csc_error *csc_error = (void *)info->details; + + /* + * CSC uncorrectable errors are classified as hardware errors and firmware errors. + * CSC firmware errors are critical errors that can be recovered only by firmware + * update via SPI driver. On a CSC firmware error, PCODE enables FDO mode and sets + * the bit in the capability register. 
On receiving this error, the driver enables + * runtime survivability mode which notifies userspace that a firmware update + * is required. + */ + if (csc_error->hec_fw_error) { + xe_err(xe, "[RAS]: CSC %s detected: 0x%x\n", + sev_to_str(counter->common.severity), + csc_error->hec_fw_error); + xe_survivability_mode_runtime_enable(xe); + return XE_RAS_RECOVERY_ACTION_DISCONNECT; + } + } else if (source->ieh) { + struct xe_ras_ieh_error *ieh_error = (void *)info->details; + + if (ieh_error->global_error_status & XE_RAS_SOC_IEH_PUNIT) { + xe_err(xe, "[RAS]: PUNIT %s detected: 0x%x\n", + sev_to_str(counter->common.severity), + ieh_error->global_error_status); + /* TODO: Add PUNIT error handling */ + return XE_RAS_RECOVERY_ACTION_DISCONNECT; + } + } + + /* For other SOC internal errors, request a reset as recovery mechanism */ + return XE_RAS_RECOVERY_ACTION_RESET; +} + +static enum xe_ras_recovery_action handle_device_memory_errors(struct xe_device *xe, + struct xe_ras_error_array *arr) +{ + struct xe_ras_memory_error *info = (void *)arr->details; + int ret; + + if (info->category & XE_RAS_MEMORY_ECC) { + xe_err(xe, "[RAS]: double-bit ECC error detected at sw address 0x%llx\n", + info->sw_address); + ret = handle_page_offline(xe, info->sw_address, true); + if (!ret) + return XE_RAS_RECOVERY_ACTION_RECOVERED; + } + + /* Request a reset for other device memory errors and if page offlining failed */ + return XE_RAS_RECOVERY_ACTION_RESET; +} + +static void get_queued_pages(struct xe_device *xe) +{ + struct xe_sysctrl_mailbox_command command = {0}; + struct xe_ras_page_offline_queue response = {0}; + u32 count = 0; + size_t rlen; + int ret, i; + + /* Supported only on platforms with system controller */ + if (!xe->info.has_sysctrl) + return; + + xe_sysctrl_create_command(&command, XE_SYSCTRL_GROUP_GFSP, + XE_SYSCTRL_CMD_GET_OFFLINE_QUEUE, NULL, 0, &response, + sizeof(response)); + + do { + memset(&response, 0, sizeof(response)); + + ret = xe_sysctrl_send_command(&xe->sc, 
&command, &rlen); + if (ret) { + xe_err(xe, "sysctrl: failed to get page offline queue %d\n", ret); + return; + } + + if (rlen != sizeof(response)) { + xe_err(xe, "sysctrl: unexpected page offline queue response length %zu (expected %zu)\n", + rlen, sizeof(response)); + return; + } + + for (i = 0; i < response.pages_returned && i < XE_RAS_NUM_PAGES; i++) + handle_page_offline(xe, response.page_addresses[i], true); + + count += response.pages_returned; + if (!response.pages_returned) + break; + + if (count > response.total_pages) { + xe_err(xe, "sysctrl: Pages returned from queue exceed total pages %u, returned %u\n", + response.total_pages, count); + return; + } + } while (response.additional_data); +} + +static void get_offlined_list(struct xe_device *xe) +{ + struct xe_sysctrl_mailbox_command command = {0}; + struct xe_ras_page_offline_list response = {0}; + u32 count = 0; + size_t rlen; + int ret, i; + + /* Supported only on platforms with system controller */ + if (!xe->info.has_sysctrl) + return; + + xe_sysctrl_create_command(&command, XE_SYSCTRL_GROUP_GFSP, XE_SYSCTRL_CMD_GET_OFFLINE_LIST, + NULL, 0, &response, sizeof(response)); + + do { + memset(&response, 0, sizeof(response)); + + ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen); + if (ret) { + xe_err(xe, "sysctrl: failed to get page offline list %d\n", ret); + return; + } + + if (rlen != sizeof(response)) { + xe_err(xe, "sysctrl: unexpected page offline list response length %zu (expected %zu)\n", + rlen, sizeof(response)); + return; + } + + for (i = 0; i < response.pages_returned && i < XE_RAS_NUM_PAGES; i++) + handle_page_offline(xe, response.page_addresses[i], false); + + count += response.pages_returned; + if (!response.pages_returned) + break; + + if (count > response.total_pages) { + xe_err(xe, "sysctrl: Pages returned from list exceed total pages %u, returned %u\n", + response.total_pages, count); + return; + } + } while (response.additional_data); +} + void 
xe_ras_counter_threshold_crossed(struct xe_device *xe, struct xe_sysctrl_event_response *response) { @@ -91,3 +382,187 @@ void xe_ras_counter_threshold_crossed(struct xe_device *xe, comp_to_str(component), sev_to_str(severity)); } } + +/** + * xe_ras_process_errors() - Process and contain hardware errors + * @xe: xe device instance + * + * Get error details from system controller and return recovery + * method. Called only from PCI error handling. + * + * Returns: recovery action to be taken + */ +enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe) +{ + struct xe_sysctrl_mailbox_command command = {0}; + struct xe_ras_get_soc_error response; + enum xe_ras_recovery_action final_action; + u32 remaining = XE_SYSCTRL_FLOOD_LIMIT; + size_t rlen; + int ret; + + if (!xe->info.has_sysctrl) + return XE_RAS_RECOVERY_ACTION_RESET; + + /* Default action */ + final_action = XE_RAS_RECOVERY_ACTION_RECOVERED; + + xe_sysctrl_create_command(&command, XE_SYSCTRL_GROUP_GFSP, XE_SYSCTRL_CMD_GET_SOC_ERROR, + NULL, 0, &response, sizeof(response)); + + do { + memset(&response, 0, sizeof(response)); + + ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen); + if (ret) { + xe_err(xe, "sysctrl: failed to get soc error %d\n", ret); + goto err; + } + + if (rlen != sizeof(response)) { + xe_err(xe, "sysctrl: unexpected get soc error response length %zu (expected %zu)\n", + rlen, sizeof(response)); + goto err; + } + + /* Report if number of errors exceeds the maximum errors supported */ + if (response.num_errors > XE_RAS_NUM_ERROR_ARR) + xe_err(xe, "sysctrl: number of errors received %d out of bound (%d)\n", + response.num_errors, XE_RAS_NUM_ERROR_ARR); + + for (int i = 0; i < response.num_errors && i < XE_RAS_NUM_ERROR_ARR; i++) { + struct xe_ras_error_array *arr = &response.arr[i]; + enum xe_ras_recovery_action action; + u8 component, severity; + + component = arr->counter.common.component; + severity = arr->counter.common.severity; + + xe_err(xe, "[RAS]: %s %s 
detected\n", comp_to_str(component), + sev_to_str(severity)); + + switch (component) { + case XE_RAS_COMP_CORE_COMPUTE: + action = handle_core_compute_errors(arr); + break; + case XE_RAS_COMP_SOC_INTERNAL: + action = handle_soc_internal_errors(xe, arr); + break; + case XE_RAS_COMP_DEVICE_MEMORY: + action = handle_device_memory_errors(xe, arr); + break; + default: + /* For any other component, reset */ + action = XE_RAS_RECOVERY_ACTION_RESET; + break; + } + + /* Process and log all errors and then trigger highest recovery action */ + if (action > final_action) + final_action = action; + } + + /* Treat flooding as an system controller error */ + if (!--remaining) { + xe_err(xe, "[RAS]: sysctrl: get soc error response flooding\n"); + return XE_RAS_RECOVERY_ACTION_RESET; + } + + } while (response.additional_errors); + + return final_action; + +err: + return XE_RAS_RECOVERY_ACTION_RESET; +} + +static struct pci_dev *find_usp_dev(struct pci_dev *pdev) +{ + struct pci_dev *vsp; + + /* + * Device Hierarchy: + * + * Upstream Switch Port (USP) --> Virtual Switch Port (VSP) --> SGunit (GPU endpoint) + */ + vsp = pci_upstream_bridge(pdev); + if (!vsp) + return NULL; + + return pci_upstream_bridge(vsp); +} + +static void aer_unmask_and_downgrade_internal_error(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + u32 aer_uncorr_mask, aer_uncorr_sev, aer_uncorr_status; + struct pci_dev *usp; + u16 aer_cap; + + usp = find_usp_dev(pdev); + if (!usp) + return; + + aer_cap = usp->aer_cap; + if (!aer_cap) { + dev_info(&usp->dev, "AER capability unavailable\n"); + return; + } + + /* + * Clear any stale Uncorrectable Internal Error Status event in Uncorrectable Error + * Status Register. 
+ */ + pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_STATUS, &aer_uncorr_status); + if (aer_uncorr_status & PCI_ERR_UNC_INTN) + pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_STATUS, PCI_ERR_UNC_INTN); + + /* + * All errors are steered to USP which is a PCIe AER Compliant device. + * Downgrade all the errors to non-fatal to prevent PCIe bus driver + * from triggering a Secondary Bus Reset (SBR). This allows error + * detection, containment and recovery in the driver. + * + * The Uncorrectable Error Severity Register has the 'Uncorrectable + * Internal Error Severity' set to fatal by default. Set this to + * non-fatal and unmask the error. + */ + + /* Initialize Uncorrectable Error Severity Register */ + pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, &aer_uncorr_sev); + aer_uncorr_sev &= ~PCI_ERR_UNC_INTN; + pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, aer_uncorr_sev); + + /* Initialize Uncorrectable Error Mask Register */ + pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, &aer_uncorr_mask); + aer_uncorr_mask &= ~PCI_ERR_UNC_INTN; + pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, aer_uncorr_mask); + + pci_save_state(usp); + dev_dbg(&usp->dev, "Uncorrectable Internal Errors downgraded and unmasked\n"); +} + +/** + * xe_ras_init - Initialize Xe RAS + * @xe: xe device instance + * + * Initialize Xe RAS + */ +void xe_ras_init(struct xe_device *xe) +{ + if (!xe->info.has_sysctrl) + return; + + if (IS_ENABLED(CONFIG_PCIEAER)) + aer_unmask_and_downgrade_internal_error(xe); + + get_queued_pages(xe); + get_offlined_list(xe); + + /* + * During probe, process and log any errors detected by firmware while the driver was not + * loaded. Critical errors such as Punit and CSC are reported through Pcode init failure, + * causing the driver to enter survivability mode. 
+ */ + xe_ras_process_errors(xe); +} diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h index ea90593b62dc..8d106c708ff1 100644 --- a/drivers/gpu/drm/xe/xe_ras.h +++ b/drivers/gpu/drm/xe/xe_ras.h @@ -6,10 +6,14 @@ #ifndef _XE_RAS_H_ #define _XE_RAS_H_ +#include "xe_ras_types.h" + struct xe_device; struct xe_sysctrl_event_response; void xe_ras_counter_threshold_crossed(struct xe_device *xe, struct xe_sysctrl_event_response *response); +void xe_ras_init(struct xe_device *xe); +enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe); #endif diff --git a/drivers/gpu/drm/xe/xe_ras_types.h b/drivers/gpu/drm/xe/xe_ras_types.h index 4e63c67f806a..af03921c7cf6 100644 --- a/drivers/gpu/drm/xe/xe_ras_types.h +++ b/drivers/gpu/drm/xe/xe_ras_types.h @@ -8,7 +8,63 @@ #include <linux/types.h> +#define XE_RAS_NUM_ERROR_ARR 3 #define XE_RAS_NUM_COUNTERS 16 +#define XE_RAS_SOC_IEH_PUNIT BIT(1) +#define XE_RAS_MEMORY_ECC BIT(1) +#define XE_RAS_NUM_PAGES 25 + +/** + * enum xe_ras_recovery_action - RAS recovery actions + * + * @XE_RAS_RECOVERY_ACTION_RECOVERED: Error recovered + * @XE_RAS_RECOVERY_ACTION_RESET: Requires reset + * @XE_RAS_RECOVERY_ACTION_DISCONNECT: Requires disconnect + * @XE_RAS_RECOVERY_ACTION_MAX: Max action value + * + * This enum defines the possible recovery actions that can be taken in response + * to RAS errors. 
+ */ +enum xe_ras_recovery_action { + XE_RAS_RECOVERY_ACTION_RECOVERED = 0, + XE_RAS_RECOVERY_ACTION_RESET, + XE_RAS_RECOVERY_ACTION_DISCONNECT, + XE_RAS_RECOVERY_ACTION_MAX +}; + +/** + * enum xe_ras_page_action - Page offline actions for page offline request + * + * @XE_RAS_PAGE_ACTION_OFFLINE: Instruct firmware to remove page from queue + * @XE_RAS_PAGE_ACTION_DECLINE: Instruct firmware to mark page as not offline + * @XE_RAS_PAGE_ACTION_MAX: Max value for validation + */ +enum xe_ras_page_action { + XE_RAS_PAGE_ACTION_OFFLINE, + XE_RAS_PAGE_ACTION_DECLINE, + XE_RAS_PAGE_ACTION_MAX +}; + +/** + * enum xe_ras_response_status - RAS response status codes + * + * @XE_RAS_STATUS_SUCCESS: Operation successful + * @XE_RAS_STATUS_INVALID_PARAM: Invalid parameter + * @XE_RAS_STATUS_OP_NOT_SUPPORTED: Operation not supported + * @XE_RAS_STATUS_TIMEOUT: Operation timed out + * @XE_RAS_STATUS_HARDWARE_FAILURE: Hardware failure + * @XE_RAS_STATUS_INSUFFICIENT_RESOURCES: Insufficient resources + * @XE_RAS_STATUS_UNKNOWN_ERROR: Unknown error + */ +enum xe_ras_response_status { + XE_RAS_STATUS_SUCCESS = 0, + XE_RAS_STATUS_INVALID_PARAM, + XE_RAS_STATUS_OP_NOT_SUPPORTED, + XE_RAS_STATUS_TIMEOUT, + XE_RAS_STATUS_HARDWARE_FAILURE, + XE_RAS_STATUS_INSUFFICIENT_RESOURCES, + XE_RAS_STATUS_UNKNOWN_ERROR +}; /** * struct xe_ras_error_common - Error fields that are common across all products @@ -70,4 +126,163 @@ struct xe_ras_threshold_crossed { struct xe_ras_error_class counters[XE_RAS_NUM_COUNTERS]; } __packed; +/** + * struct xe_ras_error_array - Details of the error types + */ +struct xe_ras_error_array { + /** @counter_value: Counter value of the returned error */ + u32 counter_value; + /** @counter: Error counter */ + struct xe_ras_error_class counter; + /** @timestamp: Timestamp */ + u64 timestamp; + /** @details: Error details specific to the counter */ + u32 details[XE_RAS_NUM_COUNTERS]; +} __packed; + +/** + * struct xe_ras_get_soc_error - Response from get soc error command + 
*/ +struct xe_ras_get_soc_error { + /** @num_errors: Number of errors reported in this response */ + u8 num_errors; + /** @additional_errors: Indicates if additional errors are pending */ + u8 additional_errors; + /** @arr: Array of up to XE_RAS_NUM_ERROR_ARR errors */ + struct xe_ras_error_array arr[XE_RAS_NUM_ERROR_ARR]; +} __packed; + +/** + * struct xe_ras_compute_error - Error details of Core Compute error + */ +struct xe_ras_compute_error { + /** @log_header: Error source and type */ + u32 log_header; + /** @reserved: Reserved */ + u32 reserved[15]; +} __packed; + +/** + * struct xe_ras_soc_error_source - Source of SoC error + */ +struct xe_ras_soc_error_source { + /** @csc: CSC */ + u32 csc:1; + /** @ieh: IEH (Integrated Error Handler) */ + u32 ieh:1; + /** @reserved: Reserved for future use */ + u32 reserved:30; +} __packed; + +/** + * struct xe_ras_soc_error - Error details of SoC internal error + */ +struct xe_ras_soc_error { + /** @source: Error source */ + struct xe_ras_soc_error_source source; + /** @details: Error details specific to the error source */ + u32 details[15]; +} __packed; + +/** + * struct xe_ras_csc_error - CSC error details + */ +struct xe_ras_csc_error { + /** @reserved: Reserved */ + u32 reserved; + /** @hec_fw_error: CSC firmware error */ + u32 hec_fw_error; +} __packed; + +/** + * struct xe_ras_ieh_error - SoC IEH (Integrated Error Handler) error details + */ +struct xe_ras_ieh_error { + /** @ieh_instance: IEH instance */ + u32 ieh_instance:2; + /** @reserved: Reserved for future use */ + u32 reserved:30; + /** @global_error_status: Global error status */ + u32 global_error_status; + /** @local_error_status: Local error status */ + u32 local_error_status; + /** @gerr_mask: Global error mask */ + u32 gerr_mask; + /** @info: Additional information */ + u32 info[10]; +} __packed; + +/** + * struct xe_ras_memory_error - Device memory error details + */ +struct xe_ras_memory_error { + /** @category: Device memory error category */ + u8 category; + /** @reserved: 
Reserved for future use */ + u8 reserved[7]; + /** @hardware_address: Hardware physical address details */ + u64 hardware_address; + /** @sw_address: Software address where error occurred */ + u64 sw_address; + /** @reserved1: Reserved */ + u32 reserved1[10]; +} __packed; + +/** + * struct xe_ras_page_offline_list - Response from get offline list command + */ +struct xe_ras_page_offline_list { + /** @max_entries: Maximum number of pages that can be stored in flash */ + u32 max_entries; + /** @total_pages: Total number of permanently offlined pages */ + u32 total_pages; + /** @pages_returned: Number of pages returned in this response */ + u32 pages_returned; + /** @page_addresses: Array of permanently offlined page addresses (4KB aligned) */ + u64 page_addresses[XE_RAS_NUM_PAGES]; + /** @additional_data: Indicates if more data is available */ + u8 additional_data; + /** @reserved: Reserved for future use */ + u8 reserved[3]; +} __packed; + +/** + * struct xe_ras_page_offline_queue - Response from get offline queue command + */ +struct xe_ras_page_offline_queue { + /** @total_pages: Total number of queued pages */ + u32 total_pages; + /** @pages_returned: Number of pages returned in this response */ + u32 pages_returned; + /** @page_addresses: Array of queued page addresses (4KB aligned) */ + u64 page_addresses[XE_RAS_NUM_PAGES]; + /** @additional_data: Indicates if more data is available */ + u8 additional_data; + /** @reserved: Reserved for future use */ + u8 reserved[3]; +} __packed; + +/** + * struct xe_ras_page_offline_request - Request for page offline command + * + * This structure provides the request format to offline/decline a page + */ +struct xe_ras_page_offline_request { + /** @page_address: Page address (4KB aligned) */ + u64 page_address; + /** @action: Action to be performed, see &enum xe_ras_page_action */ + u32 action; + /** @reserved: Reserved for future use */ + u32 reserved; +} __packed; + +/** + * struct xe_ras_page_offline_response - Response from page 
offline command + */ +struct xe_ras_page_offline_response { + /** @status: Status of the page offline request, see &enum xe_ras_response_status */ + u32 status; + /** @reserved: Reserved for future use */ + u32 reserved; +} __packed; #endif diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 427afd144f3a..4c506027fa94 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -54,7 +54,6 @@ * # cat /sys/bus/pci/devices/<device>/survivability_mode * Boot * - * * Any additional debug information if present will be visible under the directory * ``survivability_info``:: * @@ -98,6 +97,15 @@ * # cat /sys/bus/pci/devices/<device>/survivability_mode * Runtime * + * On some CSC firmware errors, PCODE sets FDO mode and the only recovery possible is through + * firmware flash using SPI driver. Userspace can check if FDO mode is set by checking the below + * sysfs entry. + * + * .. code-block:: shell + * + * # cat /sys/bus/pci/devices/<device>/survivability_info/fdo_mode + * enabled + * * When such errors occur, userspace is notified with the drm device wedged uevent and runtime * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd * to restore device to normal operation. 
@@ -296,7 +304,8 @@ static int create_survivability_sysfs(struct pci_dev *pdev) if (ret) return ret; - if (check_boot_failure(xe)) { + /* Survivability info is not required if enabled via configfs */ + if (!xe_configfs_get_survivability_mode(pdev)) { ret = devm_device_add_group(dev, &survivability_info_group); if (ret) return ret; diff --git a/drivers/gpu/drm/xe/xe_sysctrl_event.c b/drivers/gpu/drm/xe/xe_sysctrl_event.c index b4d17329af6c..da395148ee9d 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_event.c +++ b/drivers/gpu/drm/xe/xe_sysctrl_event.c @@ -16,7 +16,7 @@ static void get_pending_event(struct xe_sysctrl *sc, struct xe_sysctrl_mailbox_c { struct xe_sysctrl_event_response *response = command->data_out; struct xe_device *xe = sc_to_xe(sc); - u32 count = XE_SYSCTRL_EVENT_FLOOD; + u32 count = XE_SYSCTRL_FLOOD_LIMIT; size_t len; int ret; diff --git a/drivers/gpu/drm/xe/xe_sysctrl_event_types.h b/drivers/gpu/drm/xe/xe_sysctrl_event_types.h index c16c66b9fa7f..348768ca454a 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_event_types.h +++ b/drivers/gpu/drm/xe/xe_sysctrl_event_types.h @@ -10,9 +10,6 @@ #define XE_SYSCTRL_EVENT_DATA_LEN 59 -/* Modify as needed */ -#define XE_SYSCTRL_EVENT_FLOOD 16 - /** * enum xe_sysctrl_event - Events reported by System Controller * diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c index 3caa9f15875f..f49d8dabcf73 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c +++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c @@ -307,6 +307,34 @@ void xe_sysctrl_mailbox_init(struct xe_sysctrl *sc) sc->phase_bit = (ctrl_reg & SYSCTRL_FRAME_PHASE) ? 
1 : 0; } +/** + * xe_sysctrl_create_command() - Fill in a System Controller command structure + * @command: Sysctrl command structure to fill in + * @group_id: Command group ID + * @cmd_id: Command ID + * @request: Pointer to request buffer (can be NULL) + * @request_len: Size of request buffer + * @response: Pointer to response buffer + * @response_len: Size of response buffer + * + * Helper that fills @command for a later xe_sysctrl_send_command() call; nothing is sent here + */ +void xe_sysctrl_create_command(struct xe_sysctrl_mailbox_command *command, u8 group_id, u8 cmd_id, + void *request, size_t request_len, void *response, + size_t response_len) +{ + struct xe_sysctrl_app_msg_hdr header = {0}; + + header.data = FIELD_PREP(APP_HDR_GROUP_ID_MASK, group_id) | + FIELD_PREP(APP_HDR_COMMAND_MASK, cmd_id); + + command->header = header; + command->data_in = request; + command->data_in_len = request_len; + command->data_out = response; + command->data_out_len = response_len; +} + /** * xe_sysctrl_send_command() - Send mailbox command to System Controller * @sc: System Controller instance diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h index f67e9234de48..0ba841b0be1b 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h +++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h @@ -27,5 +27,7 @@ void xe_sysctrl_mailbox_init(struct xe_sysctrl *sc); int xe_sysctrl_send_command(struct xe_sysctrl *sc, struct xe_sysctrl_mailbox_command *cmd, size_t *rdata_len); - +void xe_sysctrl_create_command(struct xe_sysctrl_mailbox_command *command, u8 group_id, u8 cmd_id, + void *request, size_t request_len, void *response, + size_t response_len); #endif diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h index 84d7c647e743..f6cbb349c416 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h +++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h @@ -22,10 +22,18 @@ enum xe_sysctrl_group { /** * enum 
xe_sysctrl_gfsp_cmd - Commands supported by GFSP group * + * @XE_SYSCTRL_CMD_GET_SOC_ERROR: Retrieve basic error information * @XE_SYSCTRL_CMD_GET_PENDING_EVENT: Retrieve pending event + * @XE_SYSCTRL_CMD_PAGE_OFFLINE: Instruct firmware to offline/decline a page + * @XE_SYSCTRL_CMD_GET_OFFLINE_LIST: Retrieve list of all offlined pages from flash + * @XE_SYSCTRL_CMD_GET_OFFLINE_QUEUE: Retrieve list of offlined queued pages from firmware */ enum xe_sysctrl_gfsp_cmd { + XE_SYSCTRL_CMD_GET_SOC_ERROR = 0x01, XE_SYSCTRL_CMD_GET_PENDING_EVENT = 0x07, + XE_SYSCTRL_CMD_PAGE_OFFLINE = 0x08, + XE_SYSCTRL_CMD_GET_OFFLINE_LIST = 0x09, + XE_SYSCTRL_CMD_GET_OFFLINE_QUEUE = 0x0A, }; /** @@ -48,6 +56,9 @@ struct xe_sysctrl_mailbox_command { size_t data_out_len; }; +/* Modify as needed */ +#define XE_SYSCTRL_FLOOD_LIMIT 16 + #define XE_SYSCTRL_MB_FRAME_SIZE 16 #define XE_SYSCTRL_MB_MAX_FRAMES 64 #define XE_SYSCTRL_MB_MAX_MESSAGE_SIZE \ -- 2.34.1
