Add a debugfs interface to manually trigger the critical error handler for testing cold reset recovery paths. This is useful for validating the error recovery mechanism.
The new debugfs entry 'trigger_critical_error' is located at:
/sys/kernel/debug/dri/N/trigger_critical_error

Reading the file displays usage instructions. Writing '1' invokes
xe_critical_error_handler(), which marks the device as wedged with the
DRM_WEDGE_RECOVERY_COLD_RESET method and sends a uevent to userspace
indicating that a complete device power cycle is required for recovery.
Writing '0' or any other false value has no effect.

This interface is intended for development, testing, and validation of
critical error recovery code.

Signed-off-by: Mallesh Koujalagi <[email protected]>
---
 drivers/gpu/drm/xe/xe_debugfs.c | 38 +++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
index 844cfafe1ec7..61c76e5e617e 100644
--- a/drivers/gpu/drm/xe/xe_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_debugfs.c
@@ -18,6 +18,7 @@
 #include "xe_gt_debugfs.h"
 #include "xe_gt_printk.h"
 #include "xe_guc_ads.h"
+#include "xe_hw_error.h"
 #include "xe_mmio.h"
 #include "xe_pm.h"
 #include "xe_psmi.h"
@@ -509,6 +510,40 @@ static const struct file_operations disable_late_binding_fops = {
 	.write = disable_late_binding_set,
 };
 
+static ssize_t trigger_critical_error_show(struct file *f, char __user *ubuf,
+					   size_t size, loff_t *pos)
+{
+	const char *msg = "Write 1 to trigger critical error handler\n";
+
+	return simple_read_from_buffer(ubuf, size, pos, msg, strlen(msg));
+}
+
+static ssize_t trigger_critical_error_set(struct file *f,
+					  const char __user *ubuf,
+					  size_t size, loff_t *pos)
+{
+	struct xe_device *xe = file_inode(f)->i_private;
+	bool trigger;
+	ssize_t ret;
+
+	ret = kstrtobool_from_user(ubuf, size, &trigger);
+	if (ret)
+		return ret;
+
+	if (trigger) {
+		xe_critical_error_handler(xe);
+		drm_info(&xe->drm, "Critical error handler triggered via debugfs\n");
+	}
+
+	return size;
+}
+
+static const struct file_operations trigger_critical_error_fops = {
+	.owner = THIS_MODULE,
+	.read = trigger_critical_error_show,
+	.write = trigger_critical_error_set,
+};
+
 void xe_debugfs_register(struct xe_device *xe)
 {
 	struct ttm_device *bdev = &xe->ttm;
@@ -550,6 +585,9 @@ void xe_debugfs_register(struct xe_device *xe)
 	debugfs_create_file("disable_late_binding", 0600, root, xe,
 			    &disable_late_binding_fops);
 
+	debugfs_create_file("trigger_critical_error", 0600, root, xe,
+			    &trigger_critical_error_fops);
+
 	/*
 	 * Don't expose page reclaim configuration file if not supported by the
 	 * hardware initially.
-- 
2.34.1
