[PATCH v7 1/6] Introduce Xe Uncorrectable Error Handling

Mallesh Koujalagi Fri, 05 Jun 2026 05:29:22 -0700

From: Riana Tauro <[email protected]>

DO NOT REVIEW. COMPILATION ONLY.
This patch is taken from https://patchwork.freedesktop.org/series/160482/
and is included here only so that the rest of this series compiles.


Signed-off-by: Riana Tauro <[email protected]>
Signed-off-by: Mallesh Koujalagi <[email protected]>
---
 drivers/gpu/drm/xe/Makefile                   |   1 +
 drivers/gpu/drm/xe/xe_device.c                |  24 +-
 drivers/gpu/drm/xe/xe_device.h                |  27 +-
 drivers/gpu/drm/xe/xe_device_types.h          |  12 +-
 drivers/gpu/drm/xe/xe_gt.c                    |  14 +-
 drivers/gpu/drm/xe/xe_guc_submit.c            |   9 +-
 drivers/gpu/drm/xe/xe_pci.c                   |   9 +
 drivers/gpu/drm/xe/xe_pci_error.c             | 135 +++++
 drivers/gpu/drm/xe/xe_pci_error.h             |  13 +
 drivers/gpu/drm/xe/xe_ras.c                   | 475 ++++++++++++++++++
 drivers/gpu/drm/xe/xe_ras.h                   |   4 +
 drivers/gpu/drm/xe/xe_ras_types.h             | 215 ++++++++
 drivers/gpu/drm/xe/xe_survivability_mode.c    |  13 +-
 drivers/gpu/drm/xe/xe_sysctrl_event.c         |   2 +-
 drivers/gpu/drm/xe/xe_sysctrl_event_types.h   |   3 -
 drivers/gpu/drm/xe/xe_sysctrl_mailbox.c       |  28 ++
 drivers/gpu/drm/xe/xe_sysctrl_mailbox.h       |   4 +-
 drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h |  11 +
 18 files changed, 971 insertions(+), 28 deletions(-)
 create mode 100644 drivers/gpu/drm/xe/xe_pci_error.c
 create mode 100644 drivers/gpu/drm/xe/xe_pci_error.h

diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index 09661f079d03..091872771e98 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -101,6 +101,7 @@ xe-y += xe_bb.o \
        xe_page_reclaim.o \
        xe_pat.o \
        xe_pci.o \
+       xe_pci_error.o \
        xe_pci_rebar.o \
        xe_pcode.o \
        xe_pm.o \
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 51e3a2dd7b22..7ee2148f1321 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -61,6 +61,7 @@
 #include "xe_psmi.h"
 #include "xe_pxp.h"
 #include "xe_query.h"
+#include "xe_ras.h"
 #include "xe_shrinker.h"
 #include "xe_soc_remapper.h"
 #include "xe_survivability_mode.h"
@@ -915,7 +916,7 @@ static void xe_device_wedged_fini(struct drm_device *drm, 
void *arg)
 {
        struct xe_device *xe = arg;
 
-       if (atomic_read(&xe->wedged.flag))
+       if (atomic_read(&xe->wedged.fini))
                xe_pm_runtime_put(xe);
 }
 
@@ -988,6 +989,16 @@ int xe_device_probe(struct xe_device *xe)
        if (err)
                return err;
 
+       err = xe_soc_remapper_init(xe);
+       if (err)
+               return err;
+
+       err = xe_sysctrl_init(xe);
+       if (err)
+               return err;
+
+       xe_ras_init(xe);
+
        /*
         * Now that GT is initialized (TTM in particular),
         * we can try to init display, and inherit the initial fb.
@@ -1028,10 +1039,6 @@ int xe_device_probe(struct xe_device *xe)
 
        xe_nvm_init(xe);
 
-       err = xe_soc_remapper_init(xe);
-       if (err)
-               return err;
-
        err = xe_heci_gsc_init(xe);
        if (err)
                return err;
@@ -1070,10 +1077,6 @@ int xe_device_probe(struct xe_device *xe)
        if (err)
                goto err_unregister_display;
 
-       err = xe_sysctrl_init(xe);
-       if (err)
-               goto err_unregister_display;
-
        err = xe_device_sysfs_init(xe);
        if (err)
                goto err_unregister_display;
@@ -1411,7 +1414,8 @@ void xe_device_declare_wedged(struct xe_device *xe)
                return;
        }
 
-       if (!atomic_xchg(&xe->wedged.flag, 1)) {
+       if (!atomic_xchg(&xe->wedged.fini, 1)) {
+               xe_device_wedged_get(xe);
                xe->needs_flr_on_fini = true;
                xe_pm_runtime_get_noresume(xe);
                drm_err(&xe->drm,
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index 975768a6a9c8..e177c05a7a95 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -181,6 +181,21 @@ static inline bool xe_device_has_mert(const struct 
xe_device *xe)
        return xe->info.has_mert;
 }
 
+/**
+ * xe_device_is_in_reset() - Check whether the device is flagged as in reset
+ * @xe: xe device instance
+ *
+ * Return: true if @xe->in_reset is non-zero, false otherwise.
+ */
+static inline bool xe_device_is_in_reset(struct xe_device *xe)
+{
+       return atomic_read(&xe->in_reset);
+}
+
+/**
+ * xe_device_set_in_reset() - Flag the device as in reset
+ * @xe: xe device instance
+ *
+ * Sets @xe->in_reset to 1.
+ */
+static inline void xe_device_set_in_reset(struct xe_device *xe)
+{
+       atomic_set(&xe->in_reset, 1);
+}
+
+/**
+ * xe_device_clear_in_reset() - Clear the in-reset flag
+ * @xe: xe device instance
+ *
+ * Sets @xe->in_reset back to 0.
+ */
+static inline void xe_device_clear_in_reset(struct xe_device *xe)
+{
+       atomic_set(&xe->in_reset, 0);
+}
+
 u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size);
 
 void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
@@ -192,9 +207,19 @@ bool xe_device_is_l2_flush_optimized(struct xe_device *xe);
 void xe_device_td_flush(struct xe_device *xe);
 void xe_device_l2_flush(struct xe_device *xe);
 
+/**
+ * xe_device_wedged_get() - Take a wedged reference on the device
+ * @xe: xe device instance
+ *
+ * Increments @xe->wedged.ref.
+ */
+static inline void xe_device_wedged_get(struct xe_device *xe)
+{
+       atomic_inc(&xe->wedged.ref);
+}
+
+/**
+ * xe_device_wedged_put() - Drop a wedged reference on the device
+ * @xe: xe device instance
+ *
+ * Decrements @xe->wedged.ref.
+ *
+ * NOTE(review): the decrement is unconditional, so each put must be paired
+ * with an earlier xe_device_wedged_get(); an extra put drives the counter
+ * negative.
+ */
+static inline void xe_device_wedged_put(struct xe_device *xe)
+{
+       atomic_dec(&xe->wedged.ref);
+}
+
 static inline bool xe_device_wedged(struct xe_device *xe)
 {
-       return atomic_read(&xe->wedged.flag);
+       return atomic_read(&xe->wedged.ref);
 }
 
 void xe_device_set_wedged_method(struct xe_device *xe, unsigned long method);
diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
b/drivers/gpu/drm/xe/xe_device_types.h
index 32dd2ffbc796..bf43a3277d1e 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -483,10 +483,15 @@ struct xe_device {
        /** @needs_flr_on_fini: requests function-reset on fini */
        bool needs_flr_on_fini;
 
+       /** @in_reset: Indicates if device is in reset */
+       atomic_t in_reset;
+
        /** @wedged: Struct to control Wedged States and mode */
        struct {
-               /** @wedged.flag: Xe device faced a critical error and is now 
blocked. */
-               atomic_t flag;
+               /** @wedged.fini: Needs cleanup on fini */
+               atomic_t fini;
+               /** @wedged.ref: Refcount for wedged device, blocks critical 
path execution */
+               atomic_t ref;
                /** @wedged.mode: Mode controlled by kernel parameter and 
debugfs */
                enum xe_wedged_mode mode;
                /** @wedged.method: Recovery method to be sent in the drm 
device wedged uevent */
@@ -495,6 +500,9 @@ struct xe_device {
                bool inconsistent_reset;
        } wedged;
 
+       /** @devres_group_id: id for devres group */
+       void *devres_group_id;
+
        /** @bo_device: Struct to control async free of BOs */
        struct xe_bo_dev {
                /** @bo_device.async_free: Free worker */
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 783eb6d631b5..d904527a8898 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -917,6 +917,9 @@ static void gt_reset_worker(struct work_struct *w)
        if (xe_device_wedged(gt_to_xe(gt)))
                goto err_pm_put;
 
+       if (xe_device_is_in_reset(gt_to_xe(gt)))
+               goto err_pm_put;
+
        /* We only support GT resets with GuC submission */
        if (!xe_device_uc_enabled(gt_to_xe(gt)))
                goto err_pm_put;
@@ -977,18 +980,21 @@ static void gt_reset_worker(struct work_struct *w)
 
 void xe_gt_reset_async(struct xe_gt *gt)
 {
-       xe_gt_info(gt, "trying reset from %ps\n", __builtin_return_address(0));
+       struct xe_device *xe = gt_to_xe(gt);
+
+       if (xe_device_is_in_reset(xe))
+               return;
 
        /* Don't do a reset while one is already in flight */
        if (!xe_fault_inject_gt_reset() && xe_uc_reset_prepare(&gt->uc))
                return;
 
-       xe_gt_info(gt, "reset queued\n");
+       xe_gt_info(gt, "reset queued from %ps\n", __builtin_return_address(0));
 
        /* Pair with put in gt_reset_worker() if work is enqueued */
-       xe_pm_runtime_get_noresume(gt_to_xe(gt));
+       xe_pm_runtime_get_noresume(xe);
        if (!queue_work(gt->ordered_wq, &gt->reset.worker))
-               xe_pm_runtime_put(gt_to_xe(gt));
+               xe_pm_runtime_put(xe);
 }
 
 void xe_gt_suspend_prepare(struct xe_gt *gt)
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c 
b/drivers/gpu/drm/xe/xe_guc_submit.c
index 4b247a3019d2..b2870d7ab8ce 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1511,7 +1511,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
         * If devcoredump not captured and GuC capture for the job is not ready
         * do manual capture first and decide later if we need to use it
         */
-       if (!exec_queue_killed(q) && !xe->devcoredump.captured &&
+       if (!xe_device_is_in_reset(xe) && !exec_queue_killed(q) && 
!xe->devcoredump.captured &&
            !xe_guc_capture_get_matching_and_lock(q)) {
                /* take force wake before engine register manual capture */
                CLASS(xe_force_wake, fw_ref)(gt_to_fw(q->gt), XE_FORCEWAKE_ALL);
@@ -1533,8 +1533,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
        set_exec_queue_banned(q);
 
        /* Kick job / queue off hardware */
-       if (!wedged && (exec_queue_enabled(primary) ||
-                       exec_queue_pending_disable(primary))) {
+       if (!xe_device_is_in_reset(xe) && !wedged &&
+           (exec_queue_enabled(primary) || 
exec_queue_pending_disable(primary))) {
                int ret;
 
                if (exec_queue_reset(primary))
@@ -1602,7 +1602,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 
        trace_xe_sched_job_timedout(job);
 
-       if (!exec_queue_killed(q))
+       /* Do not access device if in reset */
+       if (!xe_device_is_in_reset(xe) && !exec_queue_killed(q))
                xe_devcoredump(q, job,
                               "Timedout job - seqno=%u, lrc_seqno=%u, 
guc_id=%d, flags=0x%lx",
                               xe_sched_job_seqno(job), 
xe_sched_job_lrc_seqno(job),
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 78fc2e4dcfc6..ab74a5852dbd 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -26,6 +26,7 @@
 #include "xe_guc.h"
 #include "xe_mmio.h"
 #include "xe_module.h"
+#include "xe_pci_error.h"
 #include "xe_pci_rebar.h"
 #include "xe_pci_sriov.h"
 #include "xe_pci_types.h"
@@ -1076,6 +1077,7 @@ static int xe_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
        const struct xe_device_desc *desc = (const void *)ent->driver_data;
        const struct xe_subplatform_desc *subplatform_desc;
        struct xe_device *xe;
+       void *devres_id;
        int err;
 
        subplatform_desc = find_subplatform(desc, pdev->device);
@@ -1103,6 +1105,10 @@ static int xe_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
        if (xe_display_driver_probe_defer(pdev))
                return -EPROBE_DEFER;
 
+       devres_id = devres_open_group(&pdev->dev, NULL, GFP_KERNEL);
+       if (!devres_id)
+               return -ENOMEM;
+
        err = pcim_enable_device(pdev);
        if (err)
                return err;
@@ -1111,6 +1117,8 @@ static int xe_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
        if (IS_ERR(xe))
                return PTR_ERR(xe);
 
+       xe->devres_group_id = devres_id;
+
        pci_set_drvdata(pdev, &xe->drm);
 
        xe_pm_assert_unbounded_bridge(xe);
@@ -1349,6 +1357,7 @@ static struct pci_driver xe_pci_driver = {
        .remove = xe_pci_remove,
        .shutdown = xe_pci_shutdown,
        .sriov_configure = xe_pci_sriov_configure,
+       .err_handler = &xe_pci_error_handlers,
 #ifdef CONFIG_PM_SLEEP
        .driver.pm = &xe_pm_ops,
 #endif
diff --git a/drivers/gpu/drm/xe/xe_pci_error.c 
b/drivers/gpu/drm/xe/xe_pci_error.c
new file mode 100644
index 000000000000..b08601f470d6
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_pci_error.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#include <linux/pci.h>
+
+#include "xe_device.h"
+#include "xe_gt.h"
+#include "xe_pci.h"
+#include "xe_printk.h"
+#include "xe_ras.h"
+#include "xe_survivability_mode.h"
+
+/*
+ * Quiesce the device ahead of a PCI reset: flag it as in reset, take a wedged
+ * reference, declare every GT wedged and finally disable the PCI device.
+ */
+static void prepare_device_for_reset(struct pci_dev *pdev)
+{
+       struct xe_device *xe = pdev_to_xe_device(pdev);
+       struct xe_gt *gt;
+       u8 gt_id;
+
+       xe_device_set_in_reset(xe);
+
+       /* The wedged reference keeps userspace off the device during the reset */
+       xe_device_wedged_get(xe);
+
+       for_each_gt(gt, xe, gt_id)
+               xe_gt_declare_wedged(gt);
+
+       pci_disable_device(pdev);
+}
+
+/*
+ * Translate a RAS recovery action into the result code expected by the PCI
+ * error recovery core, performing the side effects each result requires.
+ *
+ * @pdev: PCI device being recovered
+ * @action: one of the XE_RAS_RECOVERY_ACTION_* values
+ *
+ * Returns PCI_ERS_RESULT_RECOVERED, PCI_ERS_RESULT_NEED_RESET or
+ * PCI_ERS_RESULT_DISCONNECT (also used for any unrecognised action).
+ */
+static pci_ers_result_t ras_action_to_pci_result(struct pci_dev *pdev, u32 action)
+{
+       switch (action) {
+       case XE_RAS_RECOVERY_ACTION_RECOVERED:
+               /*
+                * Reporting RECOVERED makes the PCI core invoke the resume
+                * callback, which unconditionally drops a wedged reference.
+                * Take that reference here; otherwise the counter goes
+                * negative and xe_device_wedged() reports true from then on.
+                */
+               xe_device_wedged_get(pdev_to_xe_device(pdev));
+               return PCI_ERS_RESULT_RECOVERED;
+       case XE_RAS_RECOVERY_ACTION_RESET:
+               prepare_device_for_reset(pdev);
+               return PCI_ERS_RESULT_NEED_RESET;
+       case XE_RAS_RECOVERY_ACTION_DISCONNECT:
+       default:
+               return PCI_ERS_RESULT_DISCONNECT;
+       }
+}
+
+/*
+ * error_detected callback: first step of PCI error recovery.
+ *
+ * @pdev: PCI device that reported the error
+ * @state: channel state reported by the PCI core
+ *
+ * Returns PCI_ERS_RESULT_DISCONNECT on permanent failure or when the device is
+ * already wedged or in boot survivability mode, PCI_ERS_RESULT_CAN_RECOVER
+ * when the channel is still usable, and PCI_ERS_RESULT_NEED_RESET otherwise.
+ */
+static pci_ers_result_t xe_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+       struct xe_device *xe = pdev_to_xe_device(pdev);
+
+       xe_err(xe, "PCI error: detected state = %u\n", state);
+
+       if (state == pci_channel_io_perm_failure)
+               return PCI_ERS_RESULT_DISCONNECT;
+
+       /* If the device is already wedged or in survivability mode, do not attempt recovery */
+       if (xe_survivability_mode_is_boot_enabled(xe) || xe_device_wedged(xe))
+               return PCI_ERS_RESULT_DISCONNECT;
+
+       if (state == pci_channel_io_normal)
+               return PCI_ERS_RESULT_CAN_RECOVER;
+
+       if (state != pci_channel_io_frozen)
+               xe_err(xe, "PCI error: unknown state %u\n", state);
+
+       /*
+        * Every path that asks for a reset must quiesce the device first:
+        * slot_reset re-enables the PCI device and clears the in-reset flag,
+        * so both have to be set up here, including for an unknown state.
+        */
+       prepare_device_for_reset(pdev);
+
+       return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/*
+ * mmio_enabled callback: let the RAS code process the reported errors and
+ * translate the action it selects into a PCI recovery result.
+ */
+static pci_ers_result_t xe_pci_error_mmio_enabled(struct pci_dev *pdev)
+{
+       struct xe_device *xe = pdev_to_xe_device(pdev);
+
+       xe_err(xe, "PCI error: MMIO enabled\n");
+
+       return ras_action_to_pci_result(pdev, xe_ras_process_errors(xe));
+}
+
+/*
+ * slot_reset callback: bring the device back after the slot has been reset by
+ * restoring PCI state, re-enabling the device and re-probing the driver.
+ *
+ * @pdev: PCI device that was reset
+ *
+ * Returns PCI_ERS_RESULT_RECOVERED when the re-probe succeeded, otherwise
+ * PCI_ERS_RESULT_DISCONNECT.
+ */
+static pci_ers_result_t xe_pci_error_slot_reset(struct pci_dev *pdev)
+{
+       const struct pci_device_id *ent = pci_match_id(pdev->driver->id_table, pdev);
+       struct xe_device *xe = pdev_to_xe_device(pdev);
+
+       xe_err(xe, "PCI error: slot reset\n");
+
+       /*
+        * The re-probe below hands @ent to probe(), which dereferences it.
+        * pci_match_id() returns NULL when the id table has no entry for this
+        * device, so give up before tearing anything down.
+        */
+       if (!ent) {
+               xe_err(xe, "No matching PCI device id, cannot re-probe after reset\n");
+               return PCI_ERS_RESULT_DISCONNECT;
+       }
+
+       pci_restore_state(pdev);
+
+       if (pci_enable_device(pdev)) {
+               xe_err(xe, "Cannot re-enable PCI device after reset\n");
+               return PCI_ERS_RESULT_DISCONNECT;
+       }
+
+       /*
+        * Secondary Bus Reset causes all VRAM state to be lost along with
+        * hardware state. As an initial step, re-probe the device to
+        * re-initialize the driver and hardware.
+        * TODO: optimize by re-initializing only the hardware state and re-creating
+        * kernel BOs.
+        */
+       xe_device_clear_in_reset(xe);
+       pdev->driver->remove(pdev);
+       devres_release_group(&pdev->dev, xe->devres_group_id);
+
+       if (pdev->driver->probe(pdev, ent))
+               return PCI_ERS_RESULT_DISCONNECT;
+
+       /* Look the device up again now that probe() has run */
+       xe = pdev_to_xe_device(pdev);
+
+       /* Wedge the device to prevent I/O operations till the resume callback */
+       xe_device_wedged_get(xe);
+
+       return PCI_ERS_RESULT_RECOVERED;
+}
+
+/*
+ * resume callback: last step of PCI error recovery, after which normal
+ * operation continues.
+ *
+ * NOTE(review): the wedged reference is dropped unconditionally, so every
+ * recovery path that ends here must have taken one beforehand.
+ */
+static void xe_pci_error_resume(struct pci_dev *pdev)
+{
+       struct xe_device *xe = pdev_to_xe_device(pdev);
+
+       xe_err(xe, "PCI error: resume\n");
+
+       /* Dropping the wedged reference lets I/O operations resume */
+       xe_device_wedged_put(xe);
+}
+
+/*
+ * PCI error recovery callbacks for xe, installed through the err_handler
+ * member of the driver's struct pci_driver.
+ */
+const struct pci_error_handlers xe_pci_error_handlers = {
+       .error_detected = xe_pci_error_detected,
+       .mmio_enabled   = xe_pci_error_mmio_enabled,
+       .slot_reset     = xe_pci_error_slot_reset,
+       .resume         = xe_pci_error_resume,
+};
diff --git a/drivers/gpu/drm/xe/xe_pci_error.h 
b/drivers/gpu/drm/xe/xe_pci_error.h
new file mode 100644
index 000000000000..725ad0214e62
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_pci_error.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#ifndef _XE_PCI_ERROR_H_
+#define _XE_PCI_ERROR_H_
+
+struct pci_error_handlers;
+
+extern const struct pci_error_handlers xe_pci_error_handlers;
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
index 4cb16b419b0c..1f3c4a5f39d9 100644
--- a/drivers/gpu/drm/xe/xe_ras.c
+++ b/drivers/gpu/drm/xe/xe_ras.c
@@ -3,12 +3,19 @@
  * Copyright © 2026 Intel Corporation
  */
 
+#include "xe_bo.h"
 #include "xe_device.h"
 #include "xe_printk.h"
 #include "xe_ras.h"
 #include "xe_ras_types.h"
+#include "xe_survivability_mode.h"
 #include "xe_sysctrl.h"
 #include "xe_sysctrl_event_types.h"
+#include "xe_sysctrl_mailbox.h"
+#include "xe_sysctrl_mailbox_types.h"
+
+#define CORE_COMPUTE_UNCORR_TYPE       GENMASK(26, 25)
+#define  GLOBAL_UNCORR_ERROR           2
 
 /* Severity of detected errors  */
 enum xe_ras_severity {
@@ -66,6 +73,290 @@ static inline const char *comp_to_str(u8 component)
        return xe_ras_components[component];
 }
 
+/*
+ * Map an XE_RAS_STATUS_* response code onto a negative errno.
+ * Returns 0 for XE_RAS_STATUS_SUCCESS and -EPROTO for any unlisted status.
+ */
+static int ras_status_to_errno(u32 status)
+{
+       int err = -EPROTO;
+
+       switch (status) {
+       case XE_RAS_STATUS_SUCCESS:
+               err = 0;
+               break;
+       case XE_RAS_STATUS_INVALID_PARAM:
+               err = -EINVAL;
+               break;
+       case XE_RAS_STATUS_OP_NOT_SUPPORTED:
+               err = -EOPNOTSUPP;
+               break;
+       case XE_RAS_STATUS_TIMEOUT:
+               err = -ETIMEDOUT;
+               break;
+       case XE_RAS_STATUS_HARDWARE_FAILURE:
+               err = -EIO;
+               break;
+       case XE_RAS_STATUS_INSUFFICIENT_RESOURCES:
+               err = -ENOSPC;
+               break;
+       default:
+               break;
+       }
+
+       return err;
+}
+
+/*
+ * Send a page offline command for @page_address to the system controller.
+ *
+ * @xe: xe device instance
+ * @action: page action to request, must be below XE_RAS_PAGE_ACTION_MAX
+ * @page_address: address placed in the request
+ *
+ * Returns 0 on success or when the platform has no system controller,
+ * -EINVAL for an invalid action or unexpected response length, the mailbox
+ * error if sending failed, or the errno mapped from the response status.
+ */
+static int send_page_offline(struct xe_device *xe, enum xe_ras_page_action action, u64 page_address)
+{
+       struct xe_sysctrl_mailbox_command cmd = {0};
+       struct xe_ras_page_offline_request req = {0};
+       struct xe_ras_page_offline_response resp = {0};
+       size_t resp_len;
+       int err;
+
+       if (!xe->info.has_sysctrl)
+               return 0;
+
+       if (action >= XE_RAS_PAGE_ACTION_MAX) {
+               xe_err(xe, "[RAS]: Invalid page offline action %d\n", action);
+               return -EINVAL;
+       }
+
+       req.page_address = page_address;
+       req.action = action;
+
+       xe_sysctrl_create_command(&cmd, XE_SYSCTRL_GROUP_GFSP, XE_SYSCTRL_CMD_PAGE_OFFLINE,
+                                 &req, sizeof(req), &resp, sizeof(resp));
+
+       err = xe_sysctrl_send_command(&xe->sc, &cmd, &resp_len);
+       if (err) {
+               xe_err(xe, "sysctrl: failed to send page offline command %d\n", err);
+               return err;
+       }
+
+       if (resp_len != sizeof(resp)) {
+               xe_err(xe, "sysctrl: unexpected page offline response length %zu (expected %zu)\n",
+                      resp_len, sizeof(resp));
+               return -EINVAL;
+       }
+
+       err = ras_status_to_errno(resp.status);
+       if (err)
+               xe_err(xe, "sysctrl: page offline command failed with status %d\n",
+                      resp.status);
+
+       return err;
+}
+
+/*
+ * Decide what to do with a faulty page and optionally tell the system
+ * controller about it.
+ *
+ * @xe: xe device instance
+ * @page_address: page address, must be XE_PAGE_SIZE aligned
+ * @send_offline_cmd: when true, send the chosen action via send_page_offline()
+ *
+ * The address fault lookup is still a TODO, so the lookup result is always 0
+ * for now and the chosen action is always XE_RAS_PAGE_ACTION_OFFLINE.
+ *
+ * Returns 0 on success, -EINVAL for an unaligned address, or the error from
+ * send_page_offline().
+ */
+static int handle_page_offline(struct xe_device *xe, u64 page_address, bool send_offline_cmd)
+{
+       enum xe_ras_page_action action;
+       int err = 0;
+
+       if (!IS_ALIGNED(page_address, XE_PAGE_SIZE)) {
+               xe_err(xe, "sysctrl: Unaligned page address: 0x%llx\n", page_address);
+               return -EINVAL;
+       }
+
+       /*
+        * TODO: Call function to handle address fault
+        * err = xe_ttm_vram_handle_addr_fault(xe, page_address);
+        *
+        * Meaning of the result:
+        *  0: Address is valid and can be offlined
+        * -EIO: Address belongs to a critical BO that cannot be offlined
+        * -EOPNOTSUPP: Address is valid and can be offlined but user policy is
+        *  not to offline
+        *
+        * For any other non-zero error code, skip offlining.
+        */
+       switch (err) {
+       case 0:
+               action = XE_RAS_PAGE_ACTION_OFFLINE;
+               break;
+       case -EOPNOTSUPP:
+               /* User policy set to decline page offlining */
+               action = XE_RAS_PAGE_ACTION_DECLINE;
+               break;
+       case -EIO:
+               xe_err(xe, "[RAS]: Page address belongs to critical BO: 0x%llx\n",
+                      page_address);
+               return err;
+       default:
+               xe_err(xe, "[RAS]: Failed to handle address fault 0x%llx: %d\n",
+                      page_address, err);
+               return 0;
+       }
+
+       if (!send_offline_cmd)
+               return 0;
+
+       err = send_page_offline(xe, action, page_address);
+       if (err)
+               xe_err(xe, "sysctrl: Failed to offline page for address 0x%llx: %d\n",
+                      page_address, err);
+
+       return err;
+}
+
+/*
+ * Pick the recovery action for a core compute error from the uncorrectable
+ * error type field of its log header.
+ */
+static enum xe_ras_recovery_action handle_core_compute_errors(struct xe_ras_error_array *arr)
+{
+       struct xe_ras_compute_error *compute = (void *)arr->details;
+
+       /* Request a reset if error is global */
+       if (FIELD_GET(CORE_COMPUTE_UNCORR_TYPE, compute->log_header) == GLOBAL_UNCORR_ERROR)
+               return XE_RAS_RECOVERY_ACTION_RESET;
+
+       /*
+        * Nothing to do for the remaining types: local errors are recovered
+        * using an engine reset by GuC.
+        */
+       return XE_RAS_RECOVERY_ACTION_RECOVERED;
+}
+
+/*
+ * Pick the recovery action for a SOC internal error.
+ *
+ * A CSC firmware error enables runtime survivability mode and disconnects,
+ * an IEH PUNIT error disconnects, everything else requests a reset.
+ */
+static enum xe_ras_recovery_action handle_soc_internal_errors(struct xe_device *xe,
+                                                             struct xe_ras_error_array *arr)
+{
+       struct xe_ras_soc_error *soc = (void *)arr->details;
+       struct xe_ras_soc_error_source *src = &soc->source;
+       struct xe_ras_error_class *cls = &arr->counter;
+
+       if (src->csc) {
+               struct xe_ras_csc_error *csc = (void *)soc->details;
+
+               /*
+                * CSC uncorrectable errors are classified as hardware errors and firmware errors.
+                * CSC firmware errors are critical errors that can be recovered only by firmware
+                * update via SPI driver. On a CSC firmware error, PCODE enables FDO mode and sets
+                * the bit in the capability register. On receiving this error, the driver enables
+                * runtime survivability mode which notifies userspace that a firmware update
+                * is required.
+                */
+               if (csc->hec_fw_error) {
+                       xe_err(xe, "[RAS]: CSC %s detected: 0x%x\n",
+                              sev_to_str(cls->common.severity),
+                              csc->hec_fw_error);
+                       xe_survivability_mode_runtime_enable(xe);
+                       return XE_RAS_RECOVERY_ACTION_DISCONNECT;
+               }
+
+               /* CSC source without a firmware error: reset like other SOC errors */
+               return XE_RAS_RECOVERY_ACTION_RESET;
+       }
+
+       if (src->ieh) {
+               struct xe_ras_ieh_error *ieh = (void *)soc->details;
+
+               if (ieh->global_error_status & XE_RAS_SOC_IEH_PUNIT) {
+                       xe_err(xe, "[RAS]: PUNIT %s detected: 0x%x\n",
+                              sev_to_str(cls->common.severity),
+                              ieh->global_error_status);
+                       /* TODO: Add PUNIT error handling */
+                       return XE_RAS_RECOVERY_ACTION_DISCONNECT;
+               }
+       }
+
+       /* For other SOC internal errors, request a reset as recovery mechanism */
+       return XE_RAS_RECOVERY_ACTION_RESET;
+}
+
+/*
+ * Pick the recovery action for a device memory error.
+ *
+ * An ECC error is recovered by offlining the affected page; any other
+ * category, or a failed page offline, requests a reset.
+ */
+static enum xe_ras_recovery_action handle_device_memory_errors(struct xe_device *xe,
+                                                              struct xe_ras_error_array *arr)
+{
+       struct xe_ras_memory_error *mem = (void *)arr->details;
+
+       /* Request a reset for device memory errors other than ECC */
+       if (!(mem->category & XE_RAS_MEMORY_ECC))
+               return XE_RAS_RECOVERY_ACTION_RESET;
+
+       xe_err(xe, "[RAS]: double-bit ECC error detected at sw address 0x%llx\n",
+              mem->sw_address);
+
+       /* Request a reset as well if page offlining failed */
+       if (handle_page_offline(xe, mem->sw_address, true))
+               return XE_RAS_RECOVERY_ACTION_RESET;
+
+       return XE_RAS_RECOVERY_ACTION_RECOVERED;
+}
+
+/*
+ * Fetch the page offline queue from the system controller and run
+ * handle_page_offline() with the offline command enabled on every returned
+ * address.
+ *
+ * The command is repeated while the response sets additional_data. Fetching
+ * stops on a mailbox error, an unexpected response length, an empty response,
+ * or when more pages were returned than total_pages announces.
+ */
+static void get_queued_pages(struct xe_device *xe)
+{
+       struct xe_sysctrl_mailbox_command cmd = {0};
+       struct xe_ras_page_offline_queue queue = {0};
+       u32 seen = 0;
+       size_t rlen;
+       int ret, i;
+
+       /* Supported only on platforms with system controller */
+       if (!xe->info.has_sysctrl)
+               return;
+
+       xe_sysctrl_create_command(&cmd, XE_SYSCTRL_GROUP_GFSP,
+                                 XE_SYSCTRL_CMD_GET_OFFLINE_QUEUE, NULL, 0, &queue,
+                                 sizeof(queue));
+
+       for (;;) {
+               memset(&queue, 0, sizeof(queue));
+
+               ret = xe_sysctrl_send_command(&xe->sc, &cmd, &rlen);
+               if (ret) {
+                       xe_err(xe, "sysctrl: failed to get page offline queue %d\n", ret);
+                       return;
+               }
+
+               if (rlen != sizeof(queue)) {
+                       xe_err(xe, "sysctrl: unexpected page offline queue response length %zu (expected %zu)\n",
+                              rlen, sizeof(queue));
+                       return;
+               }
+
+               /* Never read past the fixed-size address array */
+               for (i = 0; i < queue.pages_returned && i < XE_RAS_NUM_PAGES; i++)
+                       handle_page_offline(xe, queue.page_addresses[i], true);
+
+               seen += queue.pages_returned;
+               if (!queue.pages_returned)
+                       return;
+
+               if (seen > queue.total_pages) {
+                       xe_err(xe, "sysctrl: Pages returned from queue exceed total pages %u, returned %u\n",
+                              queue.total_pages, seen);
+                       return;
+               }
+
+               if (!queue.additional_data)
+                       return;
+       }
+}
+
+/*
+ * Fetch the list of already offlined pages from the system controller and
+ * run handle_page_offline() on every returned address without sending an
+ * offline command.
+ *
+ * The command is repeated while the response sets additional_data. Fetching
+ * stops on a mailbox error, an unexpected response length, an empty response,
+ * or when more pages were returned than total_pages announces.
+ */
+static void get_offlined_list(struct xe_device *xe)
+{
+       struct xe_sysctrl_mailbox_command cmd = {0};
+       struct xe_ras_page_offline_list list = {0};
+       u32 seen = 0;
+       size_t rlen;
+       int ret, i;
+
+       /* Supported only on platforms with system controller */
+       if (!xe->info.has_sysctrl)
+               return;
+
+       xe_sysctrl_create_command(&cmd, XE_SYSCTRL_GROUP_GFSP, XE_SYSCTRL_CMD_GET_OFFLINE_LIST,
+                                 NULL, 0, &list, sizeof(list));
+
+       for (;;) {
+               memset(&list, 0, sizeof(list));
+
+               ret = xe_sysctrl_send_command(&xe->sc, &cmd, &rlen);
+               if (ret) {
+                       xe_err(xe, "sysctrl: failed to get page offline list %d\n", ret);
+                       return;
+               }
+
+               if (rlen != sizeof(list)) {
+                       xe_err(xe, "sysctrl: unexpected page offline list response length %zu (expected %zu)\n",
+                              rlen, sizeof(list));
+                       return;
+               }
+
+               /* Never read past the fixed-size address array */
+               for (i = 0; i < list.pages_returned && i < XE_RAS_NUM_PAGES; i++)
+                       handle_page_offline(xe, list.page_addresses[i], false);
+
+               seen += list.pages_returned;
+               if (!list.pages_returned)
+                       return;
+
+               if (seen > list.total_pages) {
+                       xe_err(xe, "sysctrl: Pages returned from list exceed total pages %u, returned %u\n",
+                              list.total_pages, seen);
+                       return;
+               }
+
+               if (!list.additional_data)
+                       return;
+       }
+}
+
 void xe_ras_counter_threshold_crossed(struct xe_device *xe,
                                      struct xe_sysctrl_event_response 
*response)
 {
@@ -91,3 +382,187 @@ void xe_ras_counter_threshold_crossed(struct xe_device *xe,
                        comp_to_str(component), sev_to_str(severity));
        }
 }
+
+/**
+ * xe_ras_process_errors() - Process and contain hardware errors
+ * @xe: xe device instance
+ *
+ * Query the system controller with XE_SYSCTRL_CMD_GET_SOC_ERROR for as long as
+ * the response sets additional_errors. Every processed error is logged and
+ * dispatched to the handler of its component; an error from any component
+ * other than core compute, SoC internal or device memory requests a reset.
+ * At most XE_RAS_NUM_ERROR_ARR errors are processed per response and at most
+ * XE_SYSCTRL_FLOOD_LIMIT responses are read.
+ *
+ * Called from PCI error handling; xe_ras_init() also calls it during probe.
+ *
+ * Returns: the highest-valued recovery action requested by any processed
+ * error, starting from XE_RAS_RECOVERY_ACTION_RECOVERED.
+ * XE_RAS_RECOVERY_ACTION_RESET is returned when the device has no system
+ * controller, when a command fails or its response has an unexpected length,
+ * or when the flood limit is reached.
+ */
+enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe)
+{
+       struct xe_sysctrl_mailbox_command command = {0};
+       struct xe_ras_get_soc_error response;
+       enum xe_ras_recovery_action final_action;
+       u32 remaining = XE_SYSCTRL_FLOOD_LIMIT;
+       size_t rlen;
+       int ret;
+
+       if (!xe->info.has_sysctrl)
+               return XE_RAS_RECOVERY_ACTION_RESET;
+
+       /* Default action, raised below to the highest action any error requests */
+       final_action = XE_RAS_RECOVERY_ACTION_RECOVERED;
+
+       xe_sysctrl_create_command(&command, XE_SYSCTRL_GROUP_GFSP, 
XE_SYSCTRL_CMD_GET_SOC_ERROR,
+                                 NULL, 0, &response, sizeof(response));
+
+       do {
+               memset(&response, 0, sizeof(response));
+
+               ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen);
+               if (ret) {
+                       xe_err(xe, "sysctrl: failed to get soc error %d\n", 
ret);
+                       goto err;
+               }
+
+               if (rlen != sizeof(response)) {
+                       xe_err(xe, "sysctrl: unexpected get soc error response 
length %zu (expected %zu)\n",
+                              rlen, sizeof(response));
+                       goto err;
+               }
+
+               /*
+                * Report if number of errors exceeds the maximum errors supported.
+                * Processing still continues with the first XE_RAS_NUM_ERROR_ARR
+                * entries.
+                */
+               if (response.num_errors > XE_RAS_NUM_ERROR_ARR)
+                       xe_err(xe, "sysctrl: number of errors received %d out 
of bound (%d)\n",
+                              response.num_errors, XE_RAS_NUM_ERROR_ARR);
+
+               for (int i = 0; i < response.num_errors && i < 
XE_RAS_NUM_ERROR_ARR; i++) {
+                       struct xe_ras_error_array *arr = &response.arr[i];
+                       enum xe_ras_recovery_action action;
+                       u8 component, severity;
+
+                       component = arr->counter.common.component;
+                       severity = arr->counter.common.severity;
+
+                       xe_err(xe, "[RAS]: %s %s detected\n", 
comp_to_str(component),
+                              sev_to_str(severity));
+
+                       switch (component) {
+                       case XE_RAS_COMP_CORE_COMPUTE:
+                               action = handle_core_compute_errors(arr);
+                               break;
+                       case XE_RAS_COMP_SOC_INTERNAL:
+                               action = handle_soc_internal_errors(xe, arr);
+                               break;
+                       case XE_RAS_COMP_DEVICE_MEMORY:
+                               action = handle_device_memory_errors(xe, arr);
+                               break;
+                       default:
+                               /* For any other component, reset */
+                               action = XE_RAS_RECOVERY_ACTION_RESET;
+                               break;
+                       }
+
+                       /* Process and log all errors and then trigger highest recovery action */
+                       if (action > final_action)
+                               final_action = action;
+               }
+
+               /*
+                * Treat flooding as a system controller error.
+                *
+                * NOTE(review): the counter is decremented after every response
+                * and checked before additional_errors, so the
+                * XE_SYSCTRL_FLOOD_LIMIT-th response requests a reset even if it
+                * reports no further pending errors.
+                */
+               if (!--remaining) {
+                       xe_err(xe, "[RAS]: sysctrl: get soc error response 
flooding\n");
+                       return XE_RAS_RECOVERY_ACTION_RESET;
+               }
+
+       } while (response.additional_errors);
+
+       return final_action;
+
+err:
+       return XE_RAS_RECOVERY_ACTION_RESET;
+}
+
+static struct pci_dev *find_usp_dev(struct pci_dev *pdev)
+{
+       struct pci_dev *vsp;
+
+       /*
+        * Device Hierarchy:
+        *
+        * Upstream Switch Port (USP) --> Virtual Switch Port (VSP) --> SGunit (GPU endpoint)
+        *
+        * Walk two bridges up from the endpoint: the upstream bridge of pdev
+        * is taken as the VSP and the upstream bridge of the VSP as the USP.
+        * NULL is returned when pdev has no upstream bridge; otherwise the
+        * result of the second pci_upstream_bridge() lookup is returned as is.
+        */
+       vsp = pci_upstream_bridge(pdev);
+       if (!vsp)
+               return NULL;
+
+       return pci_upstream_bridge(vsp);
+}
+
+static void aer_unmask_and_downgrade_internal_error(struct xe_device *xe)
+{
+       struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+       u32 aer_uncorr_mask, aer_uncorr_sev, aer_uncorr_status;
+       struct pci_dev *usp;
+       u16 aer_cap;
+
+       usp = find_usp_dev(pdev);
+       if (!usp)
+               return;
+
+       aer_cap = usp->aer_cap;
+       if (!aer_cap) {
+               dev_info(&usp->dev, "AER capability unavailable\n");
+               return;
+       }
+
+       /*
+        * Clear any stale Uncorrectable Internal Error Status event in Uncorrectable Error
+        * Status Register. The bit is written back only when it reads as set.
+        */
+       pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_STATUS, 
&aer_uncorr_status);
+       if (aer_uncorr_status & PCI_ERR_UNC_INTN)
+               pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_STATUS, 
PCI_ERR_UNC_INTN);
+
+       /*
+        * All errors are steered to USP which is a PCIe AER Compliant device.
+        * Downgrade all the errors to non-fatal to prevent PCIe bus driver
+        * from triggering a Secondary Bus Reset (SBR). This allows error
+        * detection, containment and recovery in the driver.
+        *
+        * The Uncorrectable Error Severity Register has the 'Uncorrectable
+        * Internal Error Severity' set to fatal by default. Set this to
+        * non-fatal and unmask the error.
+        *
+        * Both registers are read, modified and written back so that only the
+        * PCI_ERR_UNC_INTN bit changes. The USP state is saved afterwards with
+        * pci_save_state() - presumably so that the new settings are part of
+        * the state restored later; TODO confirm.
+        *
+        * NOTE(review): the return values of the config space accessors are
+        * not checked.
+        */
+
+       /* Severity register: clear the Uncorrectable Internal Error bit (non-fatal) */
+       pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, 
&aer_uncorr_sev);
+       aer_uncorr_sev &= ~PCI_ERR_UNC_INTN;
+       pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, 
aer_uncorr_sev);
+
+       /* Mask register: clear the Uncorrectable Internal Error bit (unmasked) */
+       pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, 
&aer_uncorr_mask);
+       aer_uncorr_mask &= ~PCI_ERR_UNC_INTN;
+       pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, 
aer_uncorr_mask);
+
+       pci_save_state(usp);
+       dev_dbg(&usp->dev, "Uncorrectable Internal Errors downgraded and 
unmasked\n");
+}
+
+/**
+ * xe_ras_init() - Initialize Xe RAS
+ * @xe: xe device instance
+ *
+ * Does nothing on platforms without a system controller. Otherwise, when
+ * CONFIG_PCIEAER is enabled, Uncorrectable Internal Errors are unmasked and
+ * downgraded via aer_unmask_and_downgrade_internal_error(). Then
+ * get_queued_pages() and get_offlined_list() are called and finally
+ * xe_ras_process_errors() is run once. The recovery action it returns is not
+ * acted upon here.
+ */
+void xe_ras_init(struct xe_device *xe)
+{
+       if (!xe->info.has_sysctrl)
+               return;
+
+       if (IS_ENABLED(CONFIG_PCIEAER))
+               aer_unmask_and_downgrade_internal_error(xe);
+
+       get_queued_pages(xe);
+       get_offlined_list(xe);
+
+       /*
+        * During probe, process and log any errors detected by firmware while the driver was
+        * not loaded. Critical errors such as Punit and CSC are reported through Pcode init
+        * failure, causing the driver to enter survivability mode.
+        */
+       xe_ras_process_errors(xe);
+}
diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
index ea90593b62dc..8d106c708ff1 100644
--- a/drivers/gpu/drm/xe/xe_ras.h
+++ b/drivers/gpu/drm/xe/xe_ras.h
@@ -6,10 +6,14 @@
 #ifndef _XE_RAS_H_
 #define _XE_RAS_H_
 
+#include "xe_ras_types.h"
+
 struct xe_device;
 struct xe_sysctrl_event_response;
 
 void xe_ras_counter_threshold_crossed(struct xe_device *xe,
                                      struct xe_sysctrl_event_response 
*response);
+void xe_ras_init(struct xe_device *xe);
+enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_ras_types.h 
b/drivers/gpu/drm/xe/xe_ras_types.h
index 4e63c67f806a..af03921c7cf6 100644
--- a/drivers/gpu/drm/xe/xe_ras_types.h
+++ b/drivers/gpu/drm/xe/xe_ras_types.h
@@ -8,7 +8,63 @@
 
 #include <linux/types.h>
 
+#define XE_RAS_NUM_ERROR_ARR                   3
 #define XE_RAS_NUM_COUNTERS                    16
+#define XE_RAS_SOC_IEH_PUNIT                   BIT(1)
+#define XE_RAS_MEMORY_ECC                      BIT(1)
+#define XE_RAS_NUM_PAGES                       25
+
+/**
+ * enum xe_ras_recovery_action - RAS recovery actions
+ *
+ * @XE_RAS_RECOVERY_ACTION_RECOVERED: Error recovered
+ * @XE_RAS_RECOVERY_ACTION_RESET: Requires reset
+ * @XE_RAS_RECOVERY_ACTION_DISCONNECT: Requires disconnect
+ * @XE_RAS_RECOVERY_ACTION_MAX: Max action value
+ *
+ * This enum defines the possible recovery actions that can be taken in response
+ * to RAS errors. The numeric order is significant: xe_ras_process_errors()
+ * compares the actions requested by the individual errors and returns the
+ * highest value, so the enumerators must stay ordered from least to most
+ * drastic action.
+ */
+enum xe_ras_recovery_action {
+       XE_RAS_RECOVERY_ACTION_RECOVERED = 0,
+       XE_RAS_RECOVERY_ACTION_RESET,
+       XE_RAS_RECOVERY_ACTION_DISCONNECT,
+       XE_RAS_RECOVERY_ACTION_MAX
+};
+
+/**
+ * enum xe_ras_page_action - Page offline actions for page offline request
+ *
+ * @XE_RAS_PAGE_ACTION_OFFLINE: Instruct firmware to remove page from queue
+ * @XE_RAS_PAGE_ACTION_DECLINE: Instruct firmware to mark page as not offline
+ * @XE_RAS_PAGE_ACTION_MAX: Max value for validation
+ */
+enum xe_ras_page_action {
+       XE_RAS_PAGE_ACTION_OFFLINE,
+       XE_RAS_PAGE_ACTION_DECLINE,
+       XE_RAS_PAGE_ACTION_MAX
+};
+
+/**
+ * enum xe_ras_response_status - RAS response status codes
+ *
+ * @XE_RAS_STATUS_SUCCESS: Operation successful
+ * @XE_RAS_STATUS_INVALID_PARAM: Invalid parameter
+ * @XE_RAS_STATUS_OP_NOT_SUPPORTED: Operation not supported
+ * @XE_RAS_STATUS_TIMEOUT: Operation timed out
+ * @XE_RAS_STATUS_HARDWARE_FAILURE: Hardware failure
+ * @XE_RAS_STATUS_INSUFFICIENT_RESOURCES: Insufficient resources
+ * @XE_RAS_STATUS_UNKNOWN_ERROR: Unknown error
+ */
+enum xe_ras_response_status {
+       XE_RAS_STATUS_SUCCESS = 0,
+       XE_RAS_STATUS_INVALID_PARAM,
+       XE_RAS_STATUS_OP_NOT_SUPPORTED,
+       XE_RAS_STATUS_TIMEOUT,
+       XE_RAS_STATUS_HARDWARE_FAILURE,
+       XE_RAS_STATUS_INSUFFICIENT_RESOURCES,
+       XE_RAS_STATUS_UNKNOWN_ERROR
+};
 
 /**
  * struct xe_ras_error_common - Error fields that are common across all products
@@ -70,4 +126,163 @@ struct xe_ras_threshold_crossed {
        struct xe_ras_error_class counters[XE_RAS_NUM_COUNTERS];
 } __packed;
 
+/**
+ * struct xe_ras_error_array - Details of the error types
+ *
+ * One entry of the arr[] array in struct xe_ras_get_soc_error.
+ */
+struct xe_ras_error_array {
+       /** @counter_value: Counter value of the returned error */
+       u32 counter_value;
+       /** @counter: Error counter */
+       struct xe_ras_error_class counter;
+       /** @timestamp: Timestamp */
+       u64 timestamp;
+       /**
+        * @details: Error details specific to the counter. The array holds
+        * XE_RAS_NUM_COUNTERS (16) u32 values, i.e. 64 bytes, which is also the
+        * size of struct xe_ras_compute_error, struct xe_ras_soc_error and
+        * struct xe_ras_memory_error - presumably the payload is interpreted as
+        * one of them depending on the component; TODO confirm against the
+        * handlers.
+        */
+       u32 details[XE_RAS_NUM_COUNTERS];
+} __packed;
+
+/**
+ * struct xe_ras_get_soc_error - Response from get soc error command
+ */
+struct xe_ras_get_soc_error {
+       /** @num_errors: Number of errors reported in this response */
+       u8 num_errors;
+       /**
+        * @additional_errors: Non-zero if more errors are pending, in which
+        * case xe_ras_process_errors() sends the command again
+        */
+       u8 additional_errors;
+       /**
+        * @arr: Array of up to XE_RAS_NUM_ERROR_ARR errors, of which the first
+        * @num_errors entries are processed
+        */
+       struct xe_ras_error_array arr[XE_RAS_NUM_ERROR_ARR];
+} __packed;
+
+/**
+ * struct xe_ras_compute_error - Error details of Core Compute error
+ */
+struct xe_ras_compute_error {
+       /** @log_header: Error Source and type */
+       u32 log_header;
+       /** @reserved: Reserved */
+       u32 reserved[15];
+} __packed;
+
+/**
+ * struct xe_ras_soc_error_source - Source of SoC error
+ */
+struct xe_ras_soc_error_source {
+       /** @csc: CSC */
+       u32 csc:1;
+       /** @ieh: IEH (Integrated Error Handler) */
+       u32 ieh:1;
+       /** @reserved: Reserved for future use */
+       u32 reserved:30;
+} __packed;
+
+/**
+ * struct xe_ras_soc_error - Error details of SoC internal error
+ */
+struct xe_ras_soc_error {
+       /** @source: Error source */
+       struct xe_ras_soc_error_source source;
+       /** @details: Error details specific to the error source */
+       u32 details[15];
+} __packed;
+
+/**
+ * struct xe_ras_csc_error - CSC error details
+ */
+struct xe_ras_csc_error {
+       /** @reserved: Reserved */
+       u32 reserved;
+       /** @hec_fw_error: CSC firmware error */
+       u32 hec_fw_error;
+} __packed;
+
+/**
+ * struct xe_ras_ieh_error - SoC IEH (Integrated Error Handler) error details
+ */
+struct xe_ras_ieh_error {
+       /** @ieh_instance: IEH instance */
+       u32 ieh_instance:2;
+       /** @reserved: Reserved for future use */
+       u32 reserved:30;
+       /** @global_error_status: Global error status */
+       u32 global_error_status;
+       /** @local_error_status: Local error status */
+       u32 local_error_status;
+       /** @gerr_mask: Global error mask */
+       u32 gerr_mask;
+       /** @info: Additional information */
+       u32 info[10];
+} __packed;
+
+/**
+ * struct xe_ras_memory_error - Device memory error details
+ */
+struct xe_ras_memory_error {
+       /** @category: Device memory error category */
+       u8 category;
+       /** @reserved: Reserved for future use */
+       u8 reserved[7];
+       /** @hardware_address: Hardware physical address details */
+       u64 hardware_address;
+       /** @sw_address: Software address where error occurred */
+       u64 sw_address;
+       /** @reserved1: Reserved */
+       u32 reserved1[10];
+} __packed;
+
+/**
+ * struct xe_ras_page_offline_list - Response from get offline list command
+ */
+struct xe_ras_page_offline_list {
+       /** @max_entries: Total number of pages that can be stored in flash */
+       u32 max_entries;
+       /** @total_pages: Total number of permanently offlined pages */
+       u32 total_pages;
+       /**
+        * @pages_returned: Number of pages returned in this response; at most
+        * XE_RAS_NUM_PAGES entries of @page_addresses are read
+        */
+       u32 pages_returned;
+       /** @page_addresses: Array of permanently offlined page addresses (4KB aligned) */
+       u64 page_addresses[XE_RAS_NUM_PAGES];
+       /**
+        * @additional_data: Indicates if more data is available; when set the
+        * command is sent again to fetch the next batch
+        */
+       u8 additional_data;
+       /** @reserved: Reserved for future use */
+       u8 reserved[3];
+} __packed;
+
+/**
+ * struct xe_ras_page_offline_queue - Response from get offline queue command
+ */
+struct xe_ras_page_offline_queue {
+       /** @total_pages: Total number of queued pages */
+       u32 total_pages;
+       /** @pages_returned: Number of pages returned in this response */
+       u32 pages_returned;
+       /** @page_addresses: Array of page addresses (4KB aligned) */
+       u64 page_addresses[XE_RAS_NUM_PAGES];
+       /** @additional_data: Indicates if more data is available */
+       u8 additional_data;
+       /** @reserved: Reserved for future use */
+       u8 reserved[3];
+} __packed;
+
+/**
+ * struct xe_ras_page_offline_request - Request for page offline command
+ *
+ * This structure provides the request format to offline/decline a page
+ */
+struct xe_ras_page_offline_request {
+       /** @page_address: Page address (4KB aligned) */
+       u64 page_address;
+       /** @action: Action to be performed, see &enum xe_ras_page_action */
+       u32 action;
+       /** @reserved: Reserved for future use */
+       u32 reserved;
+} __packed;
+
+/**
+ * struct xe_ras_page_offline_response - Response from page offline command
+ */
+struct xe_ras_page_offline_response {
+       /**
+        * @status: Status of the page offline request, see
+        * &enum xe_ras_response_status
+        */
+       u32 status;
+       /** @reserved: Reserved for future use */
+       u32 reserved;
+} __packed;
 #endif
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c 
b/drivers/gpu/drm/xe/xe_survivability_mode.c
index 427afd144f3a..4c506027fa94 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
@@ -54,7 +54,6 @@
  *     # cat /sys/bus/pci/devices/<device>/survivability_mode
  *       Boot
  *
- *
  * Any additional debug information if present will be visible under the 
directory
  * ``survivability_info``::
  *
@@ -98,6 +97,15 @@
  *     # cat /sys/bus/pci/devices/<device>/survivability_mode
  *       Runtime
  *
+ * On some CSC firmware errors, PCODE sets FDO mode and the only possible recovery is a
+ * firmware flash using the SPI driver. Userspace can check whether FDO mode is set by
+ * reading the sysfs entry below.
+ *
+ * .. code-block:: shell
+ *
+ *     # cat /sys/bus/pci/devices/<device>/survivability_info/fdo_mode
+ *       enabled
+ *
  * When such errors occur, userspace is notified with the drm device wedged 
uevent and runtime
  * survivability mode. User can then initiate a firmware flash using userspace 
tools like fwupd
  * to restore device to normal operation.
@@ -296,7 +304,8 @@ static int create_survivability_sysfs(struct pci_dev *pdev)
        if (ret)
                return ret;
 
-       if (check_boot_failure(xe)) {
+       /* Survivability info is not required if enabled via configfs */
+       if (!xe_configfs_get_survivability_mode(pdev)) {
                ret = devm_device_add_group(dev, &survivability_info_group);
                if (ret)
                        return ret;
diff --git a/drivers/gpu/drm/xe/xe_sysctrl_event.c 
b/drivers/gpu/drm/xe/xe_sysctrl_event.c
index b4d17329af6c..da395148ee9d 100644
--- a/drivers/gpu/drm/xe/xe_sysctrl_event.c
+++ b/drivers/gpu/drm/xe/xe_sysctrl_event.c
@@ -16,7 +16,7 @@ static void get_pending_event(struct xe_sysctrl *sc, struct 
xe_sysctrl_mailbox_c
 {
        struct xe_sysctrl_event_response *response = command->data_out;
        struct xe_device *xe = sc_to_xe(sc);
-       u32 count = XE_SYSCTRL_EVENT_FLOOD;
+       u32 count = XE_SYSCTRL_FLOOD_LIMIT;
        size_t len;
        int ret;
 
diff --git a/drivers/gpu/drm/xe/xe_sysctrl_event_types.h 
b/drivers/gpu/drm/xe/xe_sysctrl_event_types.h
index c16c66b9fa7f..348768ca454a 100644
--- a/drivers/gpu/drm/xe/xe_sysctrl_event_types.h
+++ b/drivers/gpu/drm/xe/xe_sysctrl_event_types.h
@@ -10,9 +10,6 @@
 
 #define XE_SYSCTRL_EVENT_DATA_LEN              59
 
-/* Modify as needed */
-#define XE_SYSCTRL_EVENT_FLOOD                 16
-
 /**
  * enum xe_sysctrl_event - Events reported by System Controller
  *
diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c 
b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c
index 3caa9f15875f..f49d8dabcf73 100644
--- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c
+++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c
@@ -307,6 +307,34 @@ void xe_sysctrl_mailbox_init(struct xe_sysctrl *sc)
        sc->phase_bit = (ctrl_reg & SYSCTRL_FRAME_PHASE) ? 1 : 0;
 }
 
+/**
+ * xe_sysctrl_create_command() - Create System controller command structure
+ * @command: Sysctrl command structure
+ * @group_id: Command group ID
+ * @cmd_id: Command ID
+ * @request: Pointer to request buffer (can be NULL)
+ * @request_len: Size of request buffer
+ * @response: Pointer to response buffer
+ * @response_len: Size of response buffer
+ *
+ * Helper function to create sysctrl command to be sent via xe_sysctrl_send_command().
+ *
+ * The application message header is built from @group_id and @cmd_id. The
+ * request and response buffers are not copied: only their pointers and lengths
+ * are stored in @command, so both buffers must remain valid for as long as
+ * @command is used.
+ */
+void xe_sysctrl_create_command(struct xe_sysctrl_mailbox_command *command, u8 
group_id, u8 cmd_id,
+                              void *request, size_t request_len, void 
*response,
+                              size_t response_len)
+{
+       struct xe_sysctrl_app_msg_hdr header = {0};
+
+       header.data = FIELD_PREP(APP_HDR_GROUP_ID_MASK, group_id) |
+                     FIELD_PREP(APP_HDR_COMMAND_MASK, cmd_id);
+
+       command->header = header;
+       command->data_in = request;
+       command->data_in_len = request_len;
+       command->data_out = response;
+       command->data_out_len = response_len;
+}
+
 /**
  * xe_sysctrl_send_command() - Send mailbox command to System Controller
  * @sc: System Controller instance
diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h 
b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h
index f67e9234de48..0ba841b0be1b 100644
--- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h
+++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h
@@ -27,5 +27,7 @@ void xe_sysctrl_mailbox_init(struct xe_sysctrl *sc);
 int xe_sysctrl_send_command(struct xe_sysctrl *sc,
                            struct xe_sysctrl_mailbox_command *cmd,
                            size_t *rdata_len);
-
+void xe_sysctrl_create_command(struct xe_sysctrl_mailbox_command *command, u8 
group_id, u8 cmd_id,
+                              void *request, size_t request_len, void 
*response,
+                              size_t response_len);
 #endif
diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h 
b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
index 84d7c647e743..f6cbb349c416 100644
--- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
+++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
@@ -22,10 +22,18 @@ enum xe_sysctrl_group {
 /**
  * enum xe_sysctrl_gfsp_cmd - Commands supported by GFSP group
  *
+ * @XE_SYSCTRL_CMD_GET_SOC_ERROR: Retrieve basic error information
  * @XE_SYSCTRL_CMD_GET_PENDING_EVENT: Retrieve pending event
+ * @XE_SYSCTRL_CMD_PAGE_OFFLINE: Instruct firmware to offline/decline a page
+ * @XE_SYSCTRL_CMD_GET_OFFLINE_LIST: Retrieve list of all offlined pages from flash
+ * @XE_SYSCTRL_CMD_GET_OFFLINE_QUEUE: Retrieve list of offlined queued pages from firmware
  */
 enum xe_sysctrl_gfsp_cmd {
+       XE_SYSCTRL_CMD_GET_SOC_ERROR            = 0x01,
        XE_SYSCTRL_CMD_GET_PENDING_EVENT        = 0x07,
+       XE_SYSCTRL_CMD_PAGE_OFFLINE             = 0x08,
+       XE_SYSCTRL_CMD_GET_OFFLINE_LIST         = 0x09,
+       XE_SYSCTRL_CMD_GET_OFFLINE_QUEUE        = 0x0A,
 };
 
 /**
@@ -48,6 +56,9 @@ struct xe_sysctrl_mailbox_command {
        size_t data_out_len;
 };
 
+/* Modify as needed */
+#define XE_SYSCTRL_FLOOD_LIMIT         16
+
 #define XE_SYSCTRL_MB_FRAME_SIZE       16
 #define XE_SYSCTRL_MB_MAX_FRAMES       64
 #define XE_SYSCTRL_MB_MAX_MESSAGE_SIZE \
-- 
2.34.1

[PATCH v7 1/6] Introduce Xe Uncorrectable Error Handling

Reply via email to