Add error-event support to XE DRM RAS so that userspace is notified
whenever a GT or SoC error occurs.

Example of subscribing to the notification and the event that is received:

$ sudo ynl --family drm_ras --subscribe error-notify
{'msg': {'error-id': 1, 'node-id': 1}, 'name': 'error-event'}

Signed-off-by: Riana Tauro <[email protected]>
---
drivers/gpu/drm/xe/xe_drm_ras.c | 17 +++++++++++++++++
drivers/gpu/drm/xe/xe_drm_ras.h | 7 +++++++
drivers/gpu/drm/xe/xe_hw_error.c | 5 +++++
3 files changed, 29 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
index c21c8b428de6..47c040c80175 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.c
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -181,6 +181,23 @@ static void xe_drm_ras_unregister_nodes(struct drm_device
*device, void *arg)
}
}
+/**
+ * xe_drm_ras_notify() - Notify userspace of an error event
+ * @ras: ras structure holding the per-severity nodes
+ * @error_id: error id forwarded to drm_ras_error_notify()
+ * @severity: error severity, used as the index into @ras->node
+ * @flags: gfp flags forwarded to drm_ras_error_notify()
+ *
+ * Calls drm_ras_error_notify() on the node selected by @severity.
+ */
+void xe_drm_ras_notify(struct xe_drm_ras *ras, u32 error_id,
+ const enum drm_xe_ras_error_severity severity, gfp_t
flags)
+{
+ struct drm_ras_node *node = &ras->node[severity];
+
+ drm_ras_error_notify(node, error_id, flags);
+}
+
/**
* xe_drm_ras_init() - Initialize DRM RAS
* @xe: xe device instance
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.h b/drivers/gpu/drm/xe/xe_drm_ras.h
index 5cc8f0124411..ac347d0d63eb 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.h
+++ b/drivers/gpu/drm/xe/xe_drm_ras.h
@@ -5,11 +5,18 @@
#ifndef XE_DRM_RAS_H_
#define XE_DRM_RAS_H_
+#include <linux/types.h>
+
+#include <drm/xe_drm.h>
+
struct xe_device;
+struct xe_drm_ras;
#define for_each_error_severity(i) \
for (i = 0; i < DRM_XE_RAS_ERR_SEV_MAX; i++)
int xe_drm_ras_init(struct xe_device *xe);
+void xe_drm_ras_notify(struct xe_drm_ras *ras, u32 error_id,
+ const enum drm_xe_ras_error_severity severity, gfp_t
flags);
#endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 2a31b430570e..17424e07e72c 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -332,6 +332,8 @@ static void gt_hw_error_handler(struct xe_tile *tile, const
enum hardware_error
xe_mmio_write32(mmio, ERR_STAT_GT_VECTOR_REG(hw_err, i),
vector);
}
+
+ xe_drm_ras_notify(ras, error_id, severity, GFP_ATOMIC);
}
static void soc_slave_ieh_handler(struct xe_tile *tile, const enum
hardware_error hw_err, u32 error_id)
@@ -368,6 +370,7 @@ static void soc_hw_error_handler(struct xe_tile *tile,
const enum hardware_error
{
const enum drm_xe_ras_error_severity severity =
hw_err_to_severity(hw_err);
struct xe_device *xe = tile_to_xe(tile);
+ struct xe_drm_ras *ras = &xe->ras;
struct xe_mmio *mmio = &tile->mmio;
unsigned long master_global_errstat, master_local_errstat;
u32 master, slave, regbit;
@@ -418,6 +421,8 @@ static void soc_hw_error_handler(struct xe_tile *tile,
const enum hardware_error
for (i = 0; i < XE_SOC_NUM_IEH; i++)
xe_mmio_write32(mmio, SOC_GSYSEVTCTL_REG(master, slave, i),
(HARDWARE_ERROR_MAX << 1) + 1);
+
+ xe_drm_ras_notify(ras, error_id, severity, GFP_ATOMIC);
}
static void hw_error_source_handler(struct xe_tile *tile, const enum
hardware_error hw_err)
--
2.47.1