On Mon, May 11, 2026 at 10:40:29PM +0530, Tauro, Riana wrote:
> On 4/18/2026 2:46 AM, Raag Jadav wrote:
> > System controller allows programming a per-error threshold value, which
> > it uses to raise error events to the driver. Get it using a mailbox
> > command so that it can be exposed to the user.
> >
> > Signed-off-by: Raag Jadav <[email protected]>
> > ---
> > drivers/gpu/drm/xe/xe_ras.c | 73 +++++++++++++++++++
> > drivers/gpu/drm/xe/xe_ras.h | 3 +
> > drivers/gpu/drm/xe/xe_ras_types.h | 22 ++++++
> > drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h | 2 +
> > 4 files changed, 100 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
> > index 08e91348c459..3e93f838aa4a 100644
> > --- a/drivers/gpu/drm/xe/xe_ras.c
> > +++ b/drivers/gpu/drm/xe/xe_ras.c
> > @@ -3,11 +3,14 @@
> > * Copyright © 2026 Intel Corporation
> > */
> > +#include "xe_pm.h"
> > #include "xe_printk.h"
> > #include "xe_ras.h"
> > #include "xe_ras_types.h"
> > #include "xe_sysctrl.h"
> > #include "xe_sysctrl_event_types.h"
> > +#include "xe_sysctrl_mailbox.h"
> > +#include "xe_sysctrl_mailbox_types.h"
> > /* Severity of detected errors */
> > enum xe_ras_severity {
> > @@ -49,6 +52,23 @@ static const char *const xe_ras_components[] = {
> > };
> > static_assert(ARRAY_SIZE(xe_ras_components) == XE_RAS_COMP_MAX);
> > +/* uAPI mapping */
> > +static const int drm_to_xe_ras_components[] = {
> > + [DRM_XE_RAS_ERR_COMP_CORE_COMPUTE] = XE_RAS_COMP_CORE_COMPUTE,
> > + [DRM_XE_RAS_ERR_COMP_SOC_INTERNAL] = XE_RAS_COMP_SOC_INTERNAL,
> > + [DRM_XE_RAS_ERR_COMP_DEVICE_MEMORY] = XE_RAS_COMP_DEVICE_MEMORY,
> > + [DRM_XE_RAS_ERR_COMP_PCIE] = XE_RAS_COMP_PCIE,
> > + [DRM_XE_RAS_ERR_COMP_FABRIC] = XE_RAS_COMP_FABRIC
> > +};
> > +static_assert(ARRAY_SIZE(drm_to_xe_ras_components) ==
> > DRM_XE_RAS_ERR_COMP_MAX);
> > +
> > +/* uAPI mapping */
> > +static const int drm_to_xe_ras_severities[] = {
> > + [DRM_XE_RAS_ERR_SEV_CORRECTABLE] = XE_RAS_SEV_CORRECTABLE,
> > + [DRM_XE_RAS_ERR_SEV_UNCORRECTABLE] = XE_RAS_SEV_UNCORRECTABLE
> > +};
> > +static_assert(ARRAY_SIZE(drm_to_xe_ras_severities) ==
> > DRM_XE_RAS_ERR_SEV_MAX);
> > +
> > static inline const char *sev_to_str(u8 sev)
> > {
> > if (sev >= XE_RAS_SEV_MAX)
> > @@ -90,3 +110,56 @@ void xe_ras_counter_threshold_crossed(struct xe_device
> > *xe,
> > comp_to_str(component), sev_to_str(severity));
> > }
> > }
> > +
> > +static void ras_command_prepare(struct xe_sysctrl_mailbox_command *command,
> > + void *request, size_t request_len, void
> > *response,
> > + size_t response_len, u8 hdr_cmd)
> > +{
> > + struct xe_sysctrl_app_msg_hdr header = {};
> > +
> > + header.data = REG_FIELD_PREP(APP_HDR_GROUP_ID_MASK,
> > XE_SYSCTRL_GROUP_GFSP) |
> > + REG_FIELD_PREP(APP_HDR_COMMAND_MASK, hdr_cmd);
> > +
> > + command->header = header;
> > + command->data_in = request;
> > + command->data_in_len = request_len;
> > + command->data_out = response;
> > + command->data_out_len = response_len;
> > +}
> > +
> > +int xe_ras_get_threshold(struct xe_device *xe, u32 severity, u32
> > component, u32 *threshold)
> > +{
> > + struct xe_ras_get_threshold_response response = {};
> > + struct xe_ras_get_threshold_request request = {};
> > + struct xe_sysctrl_mailbox_command command = {};
> > + struct xe_ras_error_class counter = {};
> > + size_t len;
> > + int ret;
> > +
> > + counter.common.severity = drm_to_xe_ras_severities[severity];
> > + counter.common.component = drm_to_xe_ras_components[component];
>
> I see this is only for correctable errors. We do not have correctable memory
> errors.
> Do we want to return -EOPNOTSUPP for memory errors?
Could be done, but I'm expecting it to come from firmware since the driver
is more or less acting as a transport here.
Raag
> > + request.counter = counter;
> > +
> > + ras_command_prepare(&command, &request, sizeof(request), &response,
> > + sizeof(response), XE_SYSCTRL_CMD_GET_THRESHOLD);
> > +
> > + guard(xe_pm_runtime)(xe);
> > + ret = xe_sysctrl_send_command(&xe->sc, &command, &len);
> > + if (ret) {
> > + xe_err(xe, "sysctrl: failed to get threshold %d\n", ret);
> > + return ret;
> > + }
> > +
> > + if (len != sizeof(response)) {
> > + xe_err(xe, "sysctrl: unexpected get threshold response length
> > %zu (expected %zu)\n",
> > + len, sizeof(response));
> > + return -EIO;
> > + }
> > +
> > + counter = response.counter;
>
> Do we expect this to change?
Nope, but it helps with wrapping below.
> > + *threshold = response.threshold;
> > +
> > + xe_dbg(xe, "[RAS]: Get threshold %u for %s %s\n", response.threshold,
> > + comp_to_str(counter.common.component),
> > sev_to_str(counter.common.severity));
>
> Do we need this? It should be visible to the user via netlink.
It's for us, not the user.
> > + return 0;
> > +}
> > diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
> > index ea90593b62dc..982bbe61461e 100644
> > --- a/drivers/gpu/drm/xe/xe_ras.h
> > +++ b/drivers/gpu/drm/xe/xe_ras.h
> > @@ -6,10 +6,13 @@
> > #ifndef _XE_RAS_H_
> > #define _XE_RAS_H_
> > +#include <linux/types.h>
> > +
> > struct xe_device;
> > struct xe_sysctrl_event_response;
> > void xe_ras_counter_threshold_crossed(struct xe_device *xe,
> > struct xe_sysctrl_event_response
> > *response);
> > +int xe_ras_get_threshold(struct xe_device *xe, u32 severity, u32
> > component, u32 *threshold);
> > #endif
> > diff --git a/drivers/gpu/drm/xe/xe_ras_types.h
> > b/drivers/gpu/drm/xe/xe_ras_types.h
> > index 4e63c67f806a..d5da93d65cf5 100644
> > --- a/drivers/gpu/drm/xe/xe_ras_types.h
> > +++ b/drivers/gpu/drm/xe/xe_ras_types.h
> > @@ -70,4 +70,26 @@ struct xe_ras_threshold_crossed {
> > struct xe_ras_error_class counters[XE_RAS_NUM_COUNTERS];
> > } __packed;
> > +/**
> > + * struct xe_ras_get_threshold_request - Request structure for get
> > threshold
> > + */
> > +struct xe_ras_get_threshold_request {
> > + /** @counter: Counter to get threshold for */
> > + struct xe_ras_error_class counter;
> > + /** @reserved: Reserved for future use */
> > + u32 reserved;
> > +} __packed;
> > +
> > +/**
> > + * struct xe_ras_get_threshold_response - Response structure for get
> > threshold
> > + */
> > +struct xe_ras_get_threshold_response {
> > + /** @counter: Counter id */
>
> Nit: ID
Sure.
Raag
> > + struct xe_ras_error_class counter;
> > + /** @threshold: Threshold value */
> > + u32 threshold;
> > + /** @reserved: Reserved for future use */
> > + u32 reserved[4];
> > +} __packed;
> > +
> > #endif
> > diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
> > b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
> > index 84d7c647e743..a1b71218deca 100644
> > --- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
> > +++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
> > @@ -22,9 +22,11 @@ enum xe_sysctrl_group {
> > /**
> > * enum xe_sysctrl_gfsp_cmd - Commands supported by GFSP group
> > *
> > + * @XE_SYSCTRL_CMD_GET_THRESHOLD: Retrieve error threshold
> > * @XE_SYSCTRL_CMD_GET_PENDING_EVENT: Retrieve pending event
> > */
> > enum xe_sysctrl_gfsp_cmd {
> > + XE_SYSCTRL_CMD_GET_THRESHOLD = 0x05,
> > XE_SYSCTRL_CMD_GET_PENDING_EVENT = 0x07,
> > };