Hi Ingo,

this is ontop of tip/x86/ras. Please pull,

Thanks.

The following changes since commit c9ce8712838e48bf356144122c5ecdcdac5d1829:

  x86/mce: Reindent __mcheck_cpu_apply_quirks() properly (2015-03-23 10:16:44 
+0100)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git tags/amd_severity

for you to fetch changes up to 43eaa2a1ad70d72876cdbb2eb5450a2665e4770f:

  x86/mce: Define mce_severity function pointer (2015-03-24 12:14:15 +0100)

----------------------------------------------------------------
This has been long in the making - an AMD-specific MCE-severity grading
function. And it is actually readable at a quick glance. Further error
recovery actions will be based on its output.

Patches tested on every relevant AMD family out there.

----------------------------------------------------------------
Aravind Gopalakrishnan (2):
      x86/mce: Add an AMD severities-grading function
      x86/mce: Define mce_severity function pointer

 arch/x86/include/asm/mce.h                |  8 ++++
 arch/x86/kernel/cpu/mcheck/mce-internal.h |  2 +-
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 67 ++++++++++++++++++++++++++++++-
 arch/x86/kernel/cpu/mcheck/mce.c          | 10 +++++
 4 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index fd38a23e729f..1f5a86d518db 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -116,6 +116,12 @@ struct mca_config {
        u32 rip_msr;
 };
 
+struct mce_vendor_flags {
+       __u64           overflow_recov  : 1, /* cpuid_ebx(80000007) */
+                       __reserved_0    : 63;
+};
+extern struct mce_vendor_flags mce_flags;
+
 extern struct mca_config mca_cfg;
 extern void mce_register_decode_chain(struct notifier_block *nb);
 extern void mce_unregister_decode_chain(struct notifier_block *nb);
@@ -128,9 +134,11 @@ extern int mce_p5_enabled;
 #ifdef CONFIG_X86_MCE
 int mcheck_init(void);
 void mcheck_cpu_init(struct cpuinfo_x86 *c);
+void mcheck_vendor_init_severity(void);
 #else
 static inline int mcheck_init(void) { return 0; }
 static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_vendor_init_severity(void) {}
 #endif
 
 #ifdef CONFIG_X86_ANCIENT_MCE
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h 
b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index e12f0bfb45c1..fe32074b865b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -24,7 +24,7 @@ struct mce_bank {
        char                    attrname[ATTR_LEN];     /* attribute name */
 };
 
-int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool 
is_excp);
 struct dentry *mce_get_debugfs_dir(void);
 
 extern struct mce_bank *mce_banks;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c 
b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb433043a7f..155c9261d3ef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,7 +186,62 @@ static int error_context(struct mce *m)
        return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
 }
 
-int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool 
is_excp)
+{
+       enum context ctx = error_context(m);
+
+       /* Processor Context Corrupt, no need to fumble too much, die! */
+       if (m->status & MCI_STATUS_PCC)
+               return MCE_PANIC_SEVERITY;
+
+       if (m->status & MCI_STATUS_UC) {
+
+               /*
+                * On older systems where overflow_recov flag is not present, we
+                * should simply panic if an error overflow occurs. If
+                * overflow_recov flag is present and set, then software can try
+                * to at least kill process to prolong system operation.
+                */
+               if (mce_flags.overflow_recov) {
+                       /* software can try to contain */
+                       if (!(m->mcgstatus & MCG_STATUS_RIPV))
+                               if (ctx == IN_KERNEL)
+                                       return MCE_PANIC_SEVERITY;
+
+                               /* kill current process */
+                               return MCE_AR_SEVERITY;
+               } else {
+                       /* at least one error was not logged */
+                       if (m->status & MCI_STATUS_OVER)
+                               return MCE_PANIC_SEVERITY;
+               }
+
+               /*
+                * For any other case, return MCE_UC_SEVERITY so that we log the
+                * error and exit #MC handler.
+                */
+               return MCE_UC_SEVERITY;
+       }
+
+       /*
+        * deferred error: poll handler catches these and adds to mce_ring so
+        * memory-failure can take recovery actions.
+        */
+       if (m->status & MCI_STATUS_DEFERRED)
+               return MCE_DEFERRED_SEVERITY;
+
+       /*
+        * corrected error: poll handler catches these and passes responsibility
+        * of decoding the error to EDAC
+        */
+       return MCE_KEEP_SEVERITY;
+}
+
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool 
is_excp)
 {
        enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
        enum context ctx = error_context(m);
@@ -216,6 +271,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, 
bool is_excp)
        }
 }
 
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+                   mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+               mce_severity = mce_severity_amd;
+}
+
 #ifdef CONFIG_DEBUG_FS
 static void *s_start(struct seq_file *f, loff_t *pos)
 {
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8548b714a16b..c7df30748629 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
 struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
 
 struct mca_config mca_cfg __read_mostly = {
        .bootlog  = -1,
@@ -1535,6 +1536,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 
*c)
                        mce_banks[0].ctl = 0;
 
                /*
+                * overflow_recov is supported for F15h Models 00h-0fh
+                * even though we don't have a CPUID bit for it.
+                */
+               if (c->x86 == 0x15 && c->x86_model <= 0xf)
+                       mce_flags.overflow_recov = 1;
+
+               /*
                 * Turn off MC4_MISC thresholding banks on those models since
                 * they're not supported there.
                 */
@@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 
*c)
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
+               mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
                break;
        default:
                break;
@@ -2017,6 +2026,7 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
        mcheck_intel_therm_init();
+       mcheck_vendor_init_severity();
 
        return 0;
 }

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to