xe_drm_ras: Add support for drm ras

Riana Tauro Tue, 27 Jan 2026 22:51:45 -0800



On 1/20/2026 10:31 PM, Raag Jadav wrote:

On Mon, Jan 19, 2026 at 09:30:24AM +0530, Riana Tauro wrote:

Allocate correctable, uncorrectable nodes for every xe device
Each node contains error classes, counters and respective
query counter functions.

...

+static int hw_query_error_counter(struct xe_drm_ras_counter *info,
+                                 u32 error_id, const char **name, u32 *val)
+{
+       if (error_id < DRM_XE_RAS_ERROR_CLASS_GT || error_id >= 
DRM_XE_RAS_ERROR_CLASS_MAX)


This looks like it can be in_range().


in_range() takes start + len, so it would again need a count here.
This seems simpler

+               return -EINVAL;
+
+       if (!info[error_id].name)
+               return -ENOENT;
+
+       *name = info[error_id].name;
+       *val = atomic64_read(&info[error_id].counter);
+
+       return 0;
+}
+
+static int query_uncorrectable_error_counters(struct drm_ras_node *ep,


This is named as 'counters' but I only see a single call here. What am
I missing?


Makes sense, will fix it.

+                                             u32 error_id, const char **name,
+                                             u32 *val)


Can this be less lines?


will check

+{
+       struct xe_device *xe = ep->priv;
+       struct xe_drm_ras *ras = &xe->ras;
+       struct xe_drm_ras_counter *info = 
ras->info[DRM_XE_RAS_ERROR_SEVERITY_UNCORRECTABLE];
+
+       return hw_query_error_counter(info, error_id, name, val);
+}
+
+static int query_correctable_error_counters(struct drm_ras_node *ep,


Same as above.

+                                           u32 error_id, const char **name,
+                                           u32 *val)


Same as above.

+{
+       struct xe_device *xe = ep->priv;
+       struct xe_drm_ras *ras = &xe->ras;
+       struct xe_drm_ras_counter *info = 
ras->info[DRM_XE_RAS_ERROR_SEVERITY_CORRECTABLE];
+
+       return hw_query_error_counter(info, error_id, name, val);
+}
+
+static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device 
*xe)
+{
+       struct xe_drm_ras_counter *counter;
+       int i;
+
+       counter = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERROR_CLASS_MAX,
+                              sizeof(struct xe_drm_ras_counter), GFP_KERNEL);


I'd make this robust against type changes, i.e. sizeof(*counter).

+       if (!counter)
+               return ERR_PTR(-ENOMEM);
+
+       for (i = 0; i < DRM_XE_RAS_ERROR_CLASS_MAX; i++) {
+               if (!errors[i])
+                       continue;
+
+               counter[i].name = errors[i];
+               atomic64_set(&counter[i].counter, 0);


Doesn't drmm_kcalloc() already take care of this?

+       }
+
+       return counter;
+}
+
+static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
+                             const enum drm_xe_ras_error_severity severity)
+{
+       struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+       struct xe_drm_ras *ras = &xe->ras;
+       const char *device_name;
+
+       device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d",
+                               pci_domain_nr(pdev->bus), pdev->bus->number,
+                               PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+
+       node->device_name = device_name;
+       node->node_name = error_severity[severity];
+       node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
+       node->error_counter_range.first = DRM_XE_RAS_ERROR_CLASS_GT;
+       node->error_counter_range.last = DRM_XE_RAS_ERROR_CLASS_MAX - 1;
+       node->priv = xe;
+
+       ras->info[severity] = allocate_and_copy_counters(xe);
+       if (IS_ERR(ras->info[severity]))
+               return PTR_ERR(ras->info[severity]);
+
+       if (severity == DRM_XE_RAS_ERROR_SEVERITY_CORRECTABLE)
+               node->query_error_counter = query_correctable_error_counters;
+       else
+               node->query_error_counter = query_uncorrectable_error_counters;


Shouldn't this have an explicit severity check, at least for future-proofing?

There are only two severity types right now. In case anything else is added, the else branch can be modified accordingly.

+
+       return 0;
+}
+
+static int register_nodes(struct xe_device *xe)
+{
+       struct xe_drm_ras *ras = &xe->ras;
+       int i;
+
+       for_each_error_severity(i) {
+               struct drm_ras_node *node = &ras->node[i];
+               int ret;
+
+               ret = assign_node_params(xe, node, i);
+               if (ret)
+                       return ret;
+
+               ret = drm_ras_node_register(node);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static void xe_drm_ras_unregister_nodes(void *arg)
+{
+       struct xe_device *xe = arg;
+       struct xe_drm_ras *ras = &xe->ras;
+       int i;
+
+       for_each_error_severity(i) {
+               struct drm_ras_node *node = &ras->node[i];
+
+               drm_ras_node_unregister(node);
+
+               if (i == 0)
+                       kfree(node->device_name);


Aren't we allocating this for each node?


Thanks for catching this. The rev2 had this once per node.
I moved the entire node params to a different function.

Will fix this

+       }
+}
+
+/**
+ * xe_drm_ras_allocate_nodes - Allocate DRM RAS nodes
+ * @xe: xe device instance
+ *
+ * Allocate and register DRM RAS nodes per device
+ *
+ * Return: 0 on success, error code on failure
+ */
+int xe_drm_ras_allocate_nodes(struct xe_device *xe)
+{
+       struct xe_drm_ras *ras = &xe->ras;
+       struct drm_ras_node *node;
+       int err;
+
+       node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERROR_SEVERITY_MAX, 
sizeof(struct drm_ras_node),


Ditto for robust against type changes.


okay

+                           GFP_KERNEL);
+       if (!node)
+               return -ENOMEM;
+
+       ras->node = node;
+
+       err = register_nodes(xe);
+       if (err) {
+               drm_err(&xe->drm, "Failed to register drm ras node\n");
+               return err;
+       }
+
+       err = devm_add_action_or_reset(xe->drm.dev, 
xe_drm_ras_unregister_nodes, xe);
+       if (err) {
+               drm_err(&xe->drm, "Failed to add action for xe drm_ras\n");
+               return err;
+       }
+
+       return 0;
+}
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.h b/drivers/gpu/drm/xe/xe_drm_ras.h
new file mode 100644
index 000000000000..2d714342e4e5
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+#ifndef XE_DRM_RAS_H_
+#define XE_DRM_RAS_H_
+
+struct xe_device;
+
+#define for_each_error_severity(i)     \
+       for (i = 0; i < DRM_XE_RAS_ERROR_SEVERITY_MAX; i++)
+
+int xe_drm_ras_allocate_nodes(struct xe_device *xe);
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_drm_ras_types.h 
b/drivers/gpu/drm/xe/xe_drm_ras_types.h
new file mode 100644
index 000000000000..528c708e57da
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras_types.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#ifndef _XE_DRM_RAS_TYPES_H_
+#define _XE_DRM_RAS_TYPES_H_
+
+#include <drm/xe_drm.h>
+#include <linux/atomic.h>
+
+struct drm_ras_node;
+
+/* Error categories reported by hardware */
+enum hardware_error {
+       HARDWARE_ERROR_CORRECTABLE = 0,
+       HARDWARE_ERROR_NONFATAL = 1,
+       HARDWARE_ERROR_FATAL = 2,
+       HARDWARE_ERROR_MAX,
+};
+
+/**
+ * struct xe_drm_ras_counter - XE RAS counter
+ *
+ * This structure contains error class and counter information
+ */
+struct xe_drm_ras_counter {
+       /** @name: error class name */
+       const char *name;
+
+       /** @counter: count of error */
+       atomic64_t counter;
+};
+
+/**
+ * struct xe_drm_ras - XE DRM RAS structure
+ *
+ * This structure has details of error counters
+ */
+struct xe_drm_ras {
+       /** @node: DRM RAS node */
+       struct drm_ras_node *node;
+
+       /** @info: info array for all types of errors */
+       struct xe_drm_ras_counter *info[DRM_XE_RAS_ERROR_SEVERITY_MAX];
+


Nit: Redundant blank line.

+};
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 8c65291f36fc..b42495d3015a 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -10,20 +10,14 @@
  #include "regs/xe_irq_regs.h"

#include "xe_device.h"

+#include "xe_drm_ras.h"
  #include "xe_hw_error.h"
  #include "xe_mmio.h"
  #include "xe_survivability_mode.h"

#define HEC_UNCORR_FW_ERR_BITS 4

  extern struct fault_attr inject_csc_hw_error;
-
-/* Error categories reported by hardware */
-enum hardware_error {
-       HARDWARE_ERROR_CORRECTABLE = 0,
-       HARDWARE_ERROR_NONFATAL = 1,
-       HARDWARE_ERROR_FATAL = 2,
-       HARDWARE_ERROR_MAX,
-};
+static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;


This is unrelated to uapi changes, shouldn't we split this into a separate
patch?


Let me check if I can split this.

...

+/**
+ * enum drm_xe_ras_error_severity - DRM RAS error severity.
+ */
+enum drm_xe_ras_error_severity {
+       /** @DRM_XE_RAS_ERROR_SEVERITY_CORRECTABLE: Correctable Error */
+       DRM_XE_RAS_ERROR_SEVERITY_CORRECTABLE = 0,


DRM_XE_RAS_ERR_SEV_*? (and same for this entire file)


ERROR_SEVERITY is more verbose

+       /** @DRM_XE_RAS_ERROR_UNCORRECTABLE: Uncorrectable Error */


Match with actual name.


will fix this.

+       DRM_XE_RAS_ERROR_SEVERITY_UNCORRECTABLE,
+       /** @DRM_XE_RAS_ERROR_SEVERITY_MAX: Max severity */
+       DRM_XE_RAS_ERROR_SEVERITY_MAX /* non-ABI */
+};
+
+/**
+ * enum drm_xe_ras_error_class - DRM RAS error classes.
+ */
+enum drm_xe_ras_error_class {
+       /** @DRM_XE_RAS_ERROR_CLASS_GT: GT Error */
+       DRM_XE_RAS_ERROR_CLASS_GT = 1,
+       /** @DRM_XE_RAS_ERROR_CLASS_SOC: SoC Error */
+       DRM_XE_RAS_ERROR_CLASS_SOC,
+       /** @DRM_XE_RAS_ERROR_CLASS_MAX: Max Error */
+       DRM_XE_RAS_ERROR_CLASS_MAX      /* non-ABI */


I don't find 'CLASS' to be much translatable since it can inherently mean
anything, but I'm not sure if this is meant to match the spec naming.

PS: I've used 'COMP' for component in my series[1], but up to you.
Also, please help review it in case I've missed anything.


It's an aggregated error class.

Yeah, component does match the spec. The rest of the errors will also be renamed accordingly.


core-compute and soc-internal

Thanks
Riana


[1] 
https://lore.kernel.org/intel-xe/[email protected]/

Raag

+};
+
+/*
+ * Error severity to name mapping.
+ */
+#define DRM_XE_RAS_ERROR_SEVERITY_NAMES {                                      
\
+       [DRM_XE_RAS_ERROR_SEVERITY_CORRECTABLE] = "correctable-errors",         
      \
+       [DRM_XE_RAS_ERROR_SEVERITY_UNCORRECTABLE] = "uncorrectable-errors",   \
+}
+
+/*
+ * Error class to name mapping.
+ */
+#define DRM_XE_RAS_ERROR_CLASS_NAMES {                                 \
+       [DRM_XE_RAS_ERROR_CLASS_GT] = "GT",                           \
+       [DRM_XE_RAS_ERROR_CLASS_SOC] = "SoC"                          \
+}
+
  #if defined(__cplusplus)
  }
  #endif
--
2.47.1

Re: [PATCH v4 2/4] drm/xe/xe_drm_ras: Add support for drm ras

Reply via email to