On 06/08/2018 07:18 AM, Nicholas Piggin wrote:
> On Thu, 07 Jun 2018 22:58:55 +0530
> Mahesh J Salgaonkar <mah...@linux.vnet.ibm.com> wrote:
> 
>> From: Mahesh Salgaonkar <mah...@linux.vnet.ibm.com>
>>
>> If we get a machine check exception due to SLB errors, then dump the
>> current SLB contents, which will be very helpful in debugging the
>> root cause of SLB errors. On pseries, as of today, the system crashes on
>> SLB errors. These are soft errors and can be fixed by flushing the SLBs,
>> so the kernel can continue to function instead of crashing. This patch
>> fixes that as well.
> 
> So pseries never flushed SLB and reloaded in response to multi hit
> errors? This seems like quite a good improvement then. I like
> dumping SLB too.
> 
> It's a bit annoying that we can't really share the same code with xmon.
> That's okay, but if you take a copy like this, I suggest commenting
> both copies with a note to keep them in sync, if you re-post
> the series.
> 
>>
>> With this patch the console will log SLB contents like below on SLB MCE
>> errors:
>>
>> [  822.711728] slb contents:
> 
> Suggest keeping the same format as the xmon dump (in particular the
> CPU number; even though it's probably printed elsewhere in the MCE
> message, it doesn't hurt).

Sure, will do that and repost.

Thanks,
-Mahesh.

> 
> Reviewed-by: Nicholas Piggin <npig...@gmail.com>
> 
> Thanks,
> Nick
> 
>> [  822.711730] 00 c000000008000000 400ea1b217000500
>> [  822.711731]   1T  ESID=   c00000  VSID=      ea1b217 LLP:100
>> [  822.711732] 01 d000000008000000 400d43642f000510
>> [  822.711733]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>> [  822.711734] 09 f000000008000000 400a86c85f000500
>> [  822.711736]   1T  ESID=   f00000  VSID=      a86c85f LLP:100
>> [  822.711737] 10 00007f0008000000 400d1f26e3000d90
>> [  822.711738]   1T  ESID=       7f  VSID=      d1f26e3 LLP:110
>> [  822.711739] 11 0000000018000000 000e3615f520fd90
>> [  822.711740]  256M ESID=        1  VSID=   e3615f520f LLP:110
>> [  822.711740] 12 d000000008000000 400d43642f000510
>> [  822.711741]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>> [  822.711742] 13 d000000008000000 400d43642f000510
>> [  822.711743]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>>
>>
>> Suggested-by: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
>> Suggested-by: Michael Ellerman <m...@ellerman.id.au>
>> Signed-off-by: Mahesh Salgaonkar <mah...@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |    1 +
>>  arch/powerpc/mm/slb.c                         |   35 +++++++++++++++++++++++++
>>  arch/powerpc/platforms/pseries/ras.c          |   29 ++++++++++++++++++++-
>>  3 files changed, 64 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>> index 50ed64fba4ae..c0da68927235 100644
>> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>> @@ -487,6 +487,7 @@ extern void hpte_init_native(void);
>>  
>>  extern void slb_initialize(void);
>>  extern void slb_flush_and_rebolt(void);
>> +extern void slb_dump_contents(void);
>>  
>>  extern void slb_vmalloc_update(void);
>>  extern void slb_set_size(u16 size);
>> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
>> index 66577cc66dc9..799aa117cec3 100644
>> --- a/arch/powerpc/mm/slb.c
>> +++ b/arch/powerpc/mm/slb.c
>> @@ -145,6 +145,41 @@ void slb_flush_and_rebolt(void)
>>      get_paca()->slb_cache_ptr = 0;
>>  }
>>  
>> +void slb_dump_contents(void)
>> +{
>> +    int i;
>> +    unsigned long e, v;
>> +    unsigned long llp;
>> +
>> +    pr_err("slb contents:\n");
>> +    for (i = 0; i < mmu_slb_size; i++) {
>> +            asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
>> +            asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));
>> +
>> +            if (!e && !v)
>> +                    continue;
>> +
>> +            pr_err("%02d %016lx %016lx", i, e, v);
>> +
>> +            if (!(e & SLB_ESID_V)) {
>> +                    pr_err("\n");
>> +                    continue;
>> +            }
>> +            llp = v & SLB_VSID_LLP;
>> +            if (v & SLB_VSID_B_1T) {
>> +                    pr_err("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx\n",
>> +                            GET_ESID_1T(e),
>> +                            (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
>> +                            llp);
>> +            } else {
>> +                    pr_err(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx\n",
>> +                            GET_ESID(e),
>> +                            (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
>> +                            llp);
>> +            }
>> +    }
>> +}
>> +
>>  void slb_vmalloc_update(void)
>>  {
>>      unsigned long vflags;
>> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
>> index 2edc673be137..e56759d92356 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -422,6 +422,31 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
>>      return 0; /* need to perform reset */
>>  }
>>  
>> +static int mce_handle_error(struct rtas_error_log *errp)
>> +{
>> +    struct pseries_errorlog *pseries_log;
>> +    struct pseries_mc_errorlog *mce_log;
>> +    int disposition = rtas_error_disposition(errp);
>> +    uint8_t error_type;
>> +
>> +    pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
>> +    if (pseries_log == NULL)
>> +            goto out;
>> +
>> +    mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
>> +    error_type = rtas_mc_error_type(mce_log);
>> +
>> +    if ((disposition == RTAS_DISP_NOT_RECOVERED) &&
>> +                    (error_type == PSERIES_MC_ERROR_TYPE_SLB)) {
>> +            slb_dump_contents();
>> +            slb_flush_and_rebolt();
>> +            disposition = RTAS_DISP_FULLY_RECOVERED;
>> +    }
>> +
>> +out:
>> +    return disposition;
>> +}
>> +
>>  /*
>>   * See if we can recover from a machine check exception.
>>   * This is only called on power4 (or above) and only via
>> @@ -434,7 +459,9 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
>>  static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
>>  {
>>      int recovered = 0;
>> -    int disposition = rtas_error_disposition(err);
>> +    int disposition;
>> +
>> +    disposition = mce_handle_error(err);
>>  
>>      if (!(regs->msr & MSR_RI)) {
>>              /* If MSR_RI isn't set, we cannot recover */
>>
> 

Reply via email to