Add a debugfs interface to manually trigger the critical error handler
for testing cold reset recovery paths. This is useful for validating
the error recovery mechanism.

The new debugfs entry 'trigger_critical_error' is located at:
  /sys/kernel/debug/dri/N/trigger_critical_error

Reading the file displays usage instructions. Writing '1' invokes
xe_critical_error_handler(), which marks the device as wedged with the
DRM_WEDGE_RECOVERY_COLD_RESET method and sends a uevent to userspace
indicating that a complete device power cycle is required for recovery.

Writing '0' or any other false value has no effect.

This interface is intended for development, testing, and validation
of critical error recovery code.

Signed-off-by: Mallesh Koujalagi <[email protected]>
---
 drivers/gpu/drm/xe/xe_debugfs.c | 38 +++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
index 844cfafe1ec7..61c76e5e617e 100644
--- a/drivers/gpu/drm/xe/xe_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_debugfs.c
@@ -18,6 +18,7 @@
 #include "xe_gt_debugfs.h"
 #include "xe_gt_printk.h"
 #include "xe_guc_ads.h"
+#include "xe_hw_error.h"
 #include "xe_mmio.h"
 #include "xe_pm.h"
 #include "xe_psmi.h"
@@ -509,6 +510,40 @@ static const struct file_operations disable_late_binding_fops = {
        .write = disable_late_binding_set,
 };
 
+static ssize_t trigger_critical_error_show(struct file *f, char __user *ubuf,
+                                          size_t size, loff_t *pos)
+{
+       static const char usage[] = "Write 1 to trigger critical error handler\n";
+       /* Fixed usage hint; length excludes the terminating NUL. */
+       return simple_read_from_buffer(ubuf, size, pos, usage, sizeof(usage) - 1);
+}
+
+static ssize_t trigger_critical_error_set(struct file *f,
+                                         const char __user *ubuf,
+                                         size_t size, loff_t *pos)
+{
+       struct xe_device *xe = file_inode(f)->i_private;
+       bool trigger;
+       ssize_t ret;
+
+       ret = kstrtobool_from_user(ubuf, size, &trigger);
+       if (ret)
+               return ret;
+
+       if (trigger) {
+               xe_critical_error_handler(xe);
+               drm_info(&xe->drm, "Critical error handler triggered via 
debugfs\n");
+       }
+
+       return size;
+}
+
+static const struct file_operations trigger_critical_error_fops = {
+       .owner = THIS_MODULE,
+       .read = trigger_critical_error_show,    /* returns the usage hint */
+       .write = trigger_critical_error_set,    /* true value runs the handler */
+};
+
 void xe_debugfs_register(struct xe_device *xe)
 {
        struct ttm_device *bdev = &xe->ttm;
@@ -550,6 +585,9 @@ void xe_debugfs_register(struct xe_device *xe)
        debugfs_create_file("disable_late_binding", 0600, root, xe,
                            &disable_late_binding_fops);
 
+       debugfs_create_file("trigger_critical_error", 0600, root, xe,
+                           &trigger_critical_error_fops);
+
        /*
         * Don't expose page reclaim configuration file if not supported by the
         * hardware initially.
-- 
2.34.1

Reply via email to