Add error-event support to the Xe drm_ras code so that userspace is
notified whenever a GT or SoC error occurs.

Example, listening for error events with the YNL CLI:

$ sudo ./tools/net/ynl/pyynl/cli.py --family drm_ras --output-json \
--subscribe error-notify
{
"name": "error-event",
"msg": {
"device-name": "0000:03:00.0",
"node-id": 1,
"node-name": "uncorrectable-errors",
"error-id": 1,
"error-name": "core-compute",
"error-value": 1
}
}
Signed-off-by: Riana Tauro <[email protected]>
---
drivers/gpu/drm/xe/xe_drm_ras.c | 29 +++++++++++++++++++++++++++++
drivers/gpu/drm/xe/xe_drm_ras.h | 6 ++++++
drivers/gpu/drm/xe/xe_hw_error.c | 7 ++++++-
3 files changed, 41 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
index c21c8b428de6..68d9f561d61d 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.c
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -181,6 +181,35 @@ static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg)
}
}
+/**
+ * xe_drm_ras_event() - Notify userspace of an error event
+ * @xe: xe device structure
+ * @error_id: error id, used as an index into the per-severity counter table
+ * @severity: error severity, selects the RAS node and counter table
+ * @flags: gfp flags passed through to drm_ras_nl_error_event()
+ *
+ * Sends the error's current counter value; a no-op when it has no table entry.
+ */
+void xe_drm_ras_event(struct xe_device *xe, u32 error_id,
+		      enum drm_xe_ras_error_severity severity, gfp_t flags)
+{
+	struct xe_drm_ras *ras = &xe->ras;
+	struct drm_ras_node *node = &ras->node[severity];
+	struct xe_drm_ras_counter *info = ras->info[severity];
+	u32 value;
+	int ret;
+
+	if (!info || !info[error_id].name)
+		return;
+
+	value = atomic_read(&info[error_id].counter);
+
+	ret = drm_ras_nl_error_event(node, error_id, info[error_id].name, value, flags);
+	if (ret)
+		drm_err(&xe->drm, "Failed to send RAS error event for error %s: %d\n",
+			info[error_id].name, ret);
+}
+
/**
* xe_drm_ras_init() - Initialize DRM RAS
* @xe: xe device instance
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.h b/drivers/gpu/drm/xe/xe_drm_ras.h
index 365c70e93e82..b9b8b541d591 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.h
+++ b/drivers/gpu/drm/xe/xe_drm_ras.h
@@ -5,11 +5,17 @@
#ifndef _XE_DRM_RAS_H_
#define _XE_DRM_RAS_H_
+#include <linux/types.h>
+
+#include <drm/xe_drm.h>
+
struct xe_device;
#define for_each_error_severity(i) \
for (i = 0; i < DRM_XE_RAS_ERR_SEV_MAX; i++)
int xe_drm_ras_init(struct xe_device *xe);
+void xe_drm_ras_event(struct xe_device *xe, u32 error_id,
+ enum drm_xe_ras_error_severity severity, gfp_t flags);
#endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 5135e8e4093f..c0fda18601cd 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -279,7 +279,7 @@ static void gt_hw_error_handler(struct xe_tile *tile, const enum hardware_error
if (hw_err == HARDWARE_ERROR_NONFATAL) {
atomic_inc(&info[error_id].counter);
log_hw_error(tile, info[error_id].name, severity);
- return;
+ goto notify;
}
for (i = 0; i < PVC_GT_VECTOR_LEN(hw_err); i++) {
@@ -336,6 +336,9 @@ static void gt_hw_error_handler(struct xe_tile *tile, const enum hardware_error
xe_mmio_write32(mmio, ERR_STAT_GT_VECTOR_REG(hw_err, i),
vector);
}
+
+notify:
+ xe_drm_ras_event(xe, error_id, severity, GFP_ATOMIC);
}
static void soc_slave_ieh_handler(struct xe_tile *tile, const enum
hardware_error hw_err, u32 error_id)
@@ -418,6 +421,8 @@ static void soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error
xe_mmio_write32(mmio, SOC_GLOBAL_ERR_STAT_REG(master, hw_err),
master_global_errstat);
+ xe_drm_ras_event(xe, error_id, severity, GFP_ATOMIC);
+
unmask_gsysevtctl:
for (i = 0; i < XE_SOC_NUM_IEH; i++)
xe_mmio_write32(mmio, SOC_GSYSEVTCTL_REG(master, slave, i),
--
2.47.1