xe_hw_error: Add support for Core-Compute errors

Raag Jadav Thu, 05 Feb 2026 07:50:11 -0800

On Mon, Feb 02, 2026 at 12:14:00PM +0530, Riana Tauro wrote:
> PVC supports GT error reporting via vector registers along with
> error status register. Add support to report these errors and
> update respective counters. In case of Subslice error reported
> by vector register, process the error status register
> for applicable bits.
> 
> The counter is embedded in the xe drm ras structure and is
> exposed to the userspace using the drm_ras generic netlink
> interface.
> 
> $ sudo ynl --family drm_ras --do query-error-counter  --json


We usually add '\' at the end of each line when wrapping commands, so that
they can be copied and run directly (same for all other patches where
applicable).

>   '{"node-id":0, "error-id":1}'

Ditto.

> {'error-id': 1, 'error-name': 'core-compute', 'error-value': 0}
> 
> Co-developed-by: Himal Prasad Ghimiray <[email protected]>
> Signed-off-by: Himal Prasad Ghimiray <[email protected]>
> Signed-off-by: Riana Tauro <[email protected]>
> ---
> v2: Add ID's and names as uAPI (Rodrigo)
>     Add documentation
>     Modify commit message
> 
> v3: remove 'error' from counters
>     use drmm_kcalloc
>     add a for_each for severity
>     differentiate error classes and severity in UAPI (Raag)
>     Use correctable and uncorrectable in uapi (Pratik / Aravind)
> 
> v4: modify enums in UAPI
>     improve comments
>     add bounds check in handler
>     add error mask macro (Raag)
>     use atomic_t
>     add null pointer checks
> ---
>  drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  62 ++++++-
>  drivers/gpu/drm/xe/xe_hw_error.c           | 199 +++++++++++++++++++--
>  2 files changed, 241 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h 
> b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
> index c146b9ef44eb..17982a335941 100644
> --- a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
> @@ -6,15 +6,59 @@
>  #ifndef _XE_HW_ERROR_REGS_H_
>  #define _XE_HW_ERROR_REGS_H_
>  
> -#define HEC_UNCORR_ERR_STATUS(base)                    XE_REG((base) + 0x118)
> -#define    UNCORR_FW_REPORTED_ERR                      BIT(6)
> +#define HEC_UNCORR_ERR_STATUS(base)                  XE_REG((base) + 0x118)
> +#define   UNCORR_FW_REPORTED_ERR                     REG_BIT(6)
>  
> -#define HEC_UNCORR_FW_ERR_DW0(base)                    XE_REG((base) + 0x124)
> +#define HEC_UNCORR_FW_ERR_DW0(base)                  XE_REG((base) + 0x124)
> +
> +#define ERR_STAT_GT_COR                                      0x100160
> +#define   EU_GRF_COR_ERR                             REG_BIT(15)
> +#define   EU_IC_COR_ERR                                      REG_BIT(14)
> +#define   SLM_COR_ERR                                        REG_BIT(13)
> +#define   GUC_COR_ERR                                        REG_BIT(1)
> +
> +#define ERR_STAT_GT_NONFATAL                         0x100164
> +#define ERR_STAT_GT_FATAL                            0x100168
> +#define   EU_GRF_FAT_ERR                             REG_BIT(15)
> +#define   SLM_FAT_ERR                                        REG_BIT(13)
> +#define   GUC_FAT_ERR                                        REG_BIT(6)
> +#define   FPU_FAT_ERR                                        REG_BIT(3)
> +
> +#define ERR_STAT_GT_REG(x)                           XE_REG(_PICK_EVEN((x), \
> +                                                                       
> ERR_STAT_GT_COR, \
> +                                                                       
> ERR_STAT_GT_NONFATAL))
> +
> +#define PVC_COR_ERR_MASK                             (GUC_COR_ERR | 
> SLM_COR_ERR | \
> +                                                      EU_IC_COR_ERR | 
> EU_GRF_COR_ERR)
> +
> +#define PVC_FAT_ERR_MASK                             (FPU_FAT_ERR | 
> GUC_FAT_ERR | \
> +                                                     EU_GRF_FAT_ERR | 
> SLM_FAT_ERR)

Nit: Whitespace please!

> +#define DEV_ERR_STAT_NONFATAL                                0x100178
> +#define DEV_ERR_STAT_CORRECTABLE                     0x10017c
> +#define DEV_ERR_STAT_REG(x)                          XE_REG(_PICK_EVEN((x), \
> +                                                                       
> DEV_ERR_STAT_CORRECTABLE, \
> +                                                                       
> DEV_ERR_STAT_NONFATAL))

I know it was already like this but how does this evaluate for FATAL?

> +#define   XE_CSC_ERROR                                       17
> +#define   XE_GT_ERROR                                        0
> +
> +#define ERR_STAT_GT_FATAL_VECTOR_0                   0x100260
> +#define ERR_STAT_GT_FATAL_VECTOR_1                   0x100264
> +
> +#define ERR_STAT_GT_FATAL_VECTOR_REG(x)                      
> XE_REG(_PICK_EVEN((x), \
> +                                                               
> ERR_STAT_GT_FATAL_VECTOR_0, \
> +                                                               
> ERR_STAT_GT_FATAL_VECTOR_1))
> +
> +#define ERR_STAT_GT_COR_VECTOR_0                     0x1002a0
> +#define ERR_STAT_GT_COR_VECTOR_1                     0x1002a4
> +
> +#define ERR_STAT_GT_COR_VECTOR_REG(x)                        
> XE_REG(_PICK_EVEN((x), \
> +                                                                       
> ERR_STAT_GT_COR_VECTOR_0, \
> +                                                                       
> ERR_STAT_GT_COR_VECTOR_1))
> +
> +#define ERR_STAT_GT_VECTOR_REG(hw_err, x)            (hw_err == 
> HARDWARE_ERROR_CORRECTABLE ? \
> +                                                     
> ERR_STAT_GT_COR_VECTOR_REG(x) : \
> +                                                     
> ERR_STAT_GT_FATAL_VECTOR_REG(x))

Ditto for whitespace.

> -#define DEV_ERR_STAT_NONFATAL                        0x100178
> -#define DEV_ERR_STAT_CORRECTABLE             0x10017c
> -#define DEV_ERR_STAT_REG(x)                  XE_REG(_PICK_EVEN((x), \
> -                                                               
> DEV_ERR_STAT_CORRECTABLE, \
> -                                                               
> DEV_ERR_STAT_NONFATAL))
> -#define   XE_CSC_ERROR                               BIT(17)
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c 
> b/drivers/gpu/drm/xe/xe_hw_error.c
> index 2019aaaa1ebe..ff31fb322c8a 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.c
> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
> @@ -3,6 +3,7 @@
>   * Copyright © 2025 Intel Corporation
>   */
>  
> +#include <linux/bitmap.h>
>  #include <linux/fault-inject.h>
>  
>  #include "regs/xe_gsc_regs.h"
> @@ -15,7 +16,13 @@
>  #include "xe_mmio.h"
>  #include "xe_survivability_mode.h"
>  
> -#define  HEC_UNCORR_FW_ERR_BITS 4
> +#define  GT_HW_ERROR_MAX_ERR_BITS    16
> +#define  HEC_UNCORR_FW_ERR_BITS              4
> +#define  XE_RAS_REG_SIZE             32
> +
> +#define  PVC_ERROR_MASK_SET(hw_err, err_bit) \
> +     ((hw_err == HARDWARE_ERROR_CORRECTABLE) ? (BIT(err_bit) & 
> PVC_COR_ERR_MASK) : \
> +     (BIT(err_bit) & PVC_FAT_ERR_MASK))

I'd write this as below and move it to xe_hw_error_regs.h

#define PVC_COR_ERR_MASK_SET(err_bit)                   (PVC_COR_ERR_MASK & REG_BIT(err_bit))
#define PVC_FAT_ERR_MASK_SET(err_bit)                   (PVC_FAT_ERR_MASK & REG_BIT(err_bit))

#define PVC_ERR_MASK_SET(hw_err, err_bit)               ((hw_err == HARDWARE_ERROR_CORRECTABLE) ? \
                                                                PVC_COR_ERR_MASK_SET(err_bit) : \
                                                                PVC_FAT_ERR_MASK_SET(err_bit))

...

> +static void gt_hw_error_handler(struct xe_tile *tile, const enum 
> hardware_error hw_err,
> +                             u32 error_id)
> +{
> +     const enum drm_xe_ras_error_severity severity = 
> hw_err_to_severity(hw_err);
> +     struct xe_device *xe = tile_to_xe(tile);
> +     struct xe_drm_ras *ras = &xe->ras;
> +     struct xe_drm_ras_counter *info = ras->info[severity];
> +     struct xe_mmio *mmio = &tile->mmio;
> +     unsigned long err_stat = 0;
> +     int i, len;
> +
> +     if (xe->info.platform != XE_PVC)
> +             return;
> +
> +     if (!info)
> +             return;

Since the info allocation is not related to hardware, we shouldn't even reach
this point without it. So let's not hide bugs here, and fail probe instead.

> +     if (hw_err == HARDWARE_ERROR_NONFATAL) {
> +             atomic_inc(&info[error_id].counter);
> +             log_hw_error(tile, info[error_id].name, severity);
> +             return;
> +     }

...

>  static void hw_error_source_handler(struct xe_tile *tile, const enum 
> hardware_error hw_err)
>  {
>       const enum drm_xe_ras_error_severity severity = 
> hw_err_to_severity(hw_err);
>       const char *severity_str = error_severity[severity];
>       struct xe_device *xe = tile_to_xe(tile);
> -     unsigned long flags;
> -     u32 err_src;
> +     struct xe_drm_ras *ras = &xe->ras;
> +     struct xe_drm_ras_counter *info = ras->info[severity];
> +     unsigned long flags, err_src;
> +     u32 err_bit;
>  
> -     if (xe->info.platform != XE_BATTLEMAGE)
> +     if (!IS_DGFX(xe))
>               return;
>  
>       spin_lock_irqsave(&xe->irq.lock, flags);
> @@ -108,11 +242,53 @@ static void hw_error_source_handler(struct xe_tile 
> *tile, const enum hardware_er
>               goto unlock;
>       }
>  
> -     if (err_src & XE_CSC_ERROR)
> +     /*
> +      * On encountering CSC firmware errors, the graphics device becomes 
> unrecoverable
> +      * so return immediately on error. The only way to recover from these 
> errors is
> +      * firmware flash. The device will enter Runtime Survivability mode 
> when such
> +      * errors are detected.
> +      */
> +     if (err_src & XE_CSC_ERROR) {
>               csc_hw_error_handler(tile, hw_err);
> +             goto clear_reg;
> +     }
>  
> -     xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
> +     if (!info)
> +             goto clear_reg;

Same as above.

Raag

Re: [PATCH v5 4/5] drm/xe/xe_hw_error: Add support for Core-Compute errors

Reply via email to