When PUNIT (power management unit) errors are detected that persist across warm resets, mark the device as wedged with DRM_WEDGE_RECOVERY_COLD_RESET and notify userspace that a complete device power cycle is required to restore normal operation.
Signed-off-by: Mallesh Koujalagi <[email protected]> --- v3: - Use PUNIT instead of PMU. (Riana) - Use consistent wording. - Remove log. (Raag) v4: - Make function static. (Raag) v5: - Remove kdoc for static function. (Raag) - Remove xe_ prefix for static function. --- drivers/gpu/drm/xe/xe_ras.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c index d79f8a6589ac..604470565bf3 100644 --- a/drivers/gpu/drm/xe/xe_ras.c +++ b/drivers/gpu/drm/xe/xe_ras.c @@ -4,6 +4,8 @@ */ #include "xe_bo.h" +#include "xe_assert.h" +#include "xe_device_types.h" #include "xe_device.h" #include "xe_printk.h" #include "xe_ras.h" @@ -222,6 +224,12 @@ static enum xe_ras_recovery_action handle_core_compute_errors(struct xe_device * return XE_RAS_RECOVERY_ACTION_RECOVERED; } +static void punit_error_handler(struct xe_device *xe) +{ + xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_COLD_RESET); + xe_device_declare_wedged(xe); +} + static enum xe_ras_recovery_action handle_soc_internal_errors(struct xe_device *xe, struct xe_ras_error_array *arr) { @@ -265,7 +273,7 @@ static enum xe_ras_recovery_action handle_soc_internal_errors(struct xe_device * xe_err(xe, "[RAS]: PUNIT %s detected: 0x%x\n", sev_to_str(error_class->common.severity), ieh_error->global_error_status); - /* TODO: Add PUNIT error handling */ + punit_error_handler(xe); return XE_RAS_RECOVERY_ACTION_DISCONNECT; } } -- 2.34.1
