Frederic Barrat <fbar...@linux.ibm.com> writes:

> If the kernel is notified of an HMI caused by the NPU2, it's currently
> not being recognized and it logs the default message:
>
>     Unknown Malfunction Alert of type 3
>
> The NPU on Power 9 has 3 Fault Isolation Registers, so that's a lot of
> possible causes, but we should at least log that it's an NPU problem
> and report which FIR and which bit were raised if opal gave us the
> information.
>
> Signed-off-by: Frederic Barrat <fbar...@linux.ibm.com>
> ---
>
> Could be merged independently from (the opal-api.h change is already
> in the skiboot tree), but works better with, the matching skiboot
> change:
> http://patchwork.ozlabs.org/patch/1104076/

Well it *must* work with or without the skiboot change, because old/new
kernels will run on old/new skiboots.

It looks like it will work fine, we just won't get any extra information
in xstop_reason, right?

cheers

> diff --git a/arch/powerpc/include/asm/opal-api.h 
> b/arch/powerpc/include/asm/opal-api.h
> index e1577cfa7186..2492fe248e1e 100644
> --- a/arch/powerpc/include/asm/opal-api.h
> +++ b/arch/powerpc/include/asm/opal-api.h
> @@ -568,6 +568,7 @@ enum OpalHMI_XstopType {
>       CHECKSTOP_TYPE_UNKNOWN  =       0,
>       CHECKSTOP_TYPE_CORE     =       1,
>       CHECKSTOP_TYPE_NX       =       2,
> +     CHECKSTOP_TYPE_NPU      =       3
>  };
>  
>  enum OpalHMI_CoreXstopReason {
> diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c 
> b/arch/powerpc/platforms/powernv/opal-hmi.c
> index 586ec71a4e17..de12a240b477 100644
> --- a/arch/powerpc/platforms/powernv/opal-hmi.c
> +++ b/arch/powerpc/platforms/powernv/opal-hmi.c
> @@ -149,6 +149,43 @@ static void print_nx_checkstop_reason(const char *level,
>                                       xstop_reason[i].description);
>  }
>  
> +static void print_npu_checkstop_reason(const char *level,
> +                                     struct OpalHMIEvent *hmi_evt)
> +{
> +     uint8_t reason, reason_count, i;
> +
> +     /*
> +      * We may not have a checkstop reason on some combination of
> +      * hardware and/or skiboot version
> +      */
> +     if (!hmi_evt->u.xstop_error.xstop_reason) {
> +             printk("%s      NPU checkstop on chip %x\n", level,
> +                     be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
> +             return;
> +     }
> +
> +     /*
> +      * NPU2 has 3 FIRs. Reason encoded on a byte as:
> +      *   2 bits for the FIR number
> +      *   6 bits for the bit number
> +      * It may be possible to find several reasons.
> +      *
> +      * We don't display a specific message per FIR bit as there
> +      * are too many and most are meaningless without the workbook
> +      * and/or hw team help anyway.
> +      */
> +     reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
> +             sizeof(reason);
> +     for (i = 0; i < reason_count; i++) {
> +             reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 
> 0xFF;
> +             if (reason)
> +                     printk("%s      NPU checkstop on chip %x: FIR%d bit %d 
> is set\n",
> +                             level,
> +                             be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
> +                             reason >> 6, reason & 0x3F);
> +     }
> +}
> +
>  static void print_checkstop_reason(const char *level,
>                                       struct OpalHMIEvent *hmi_evt)
>  {
> @@ -160,6 +197,9 @@ static void print_checkstop_reason(const char *level,
>       case CHECKSTOP_TYPE_NX:
>               print_nx_checkstop_reason(level, hmi_evt);
>               break;
> +     case CHECKSTOP_TYPE_NPU:
> +             print_npu_checkstop_reason(level, hmi_evt);
> +             break;
>       default:
>               printk("%s      Unknown Malfunction Alert of type %d\n",
>                      level, type);
> -- 
> 2.21.0

Reply via email to