Hello community, here is the log from the commit of package mcelog for openSUSE:Factory checked in at 2013-10-23 19:23:57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/mcelog (Old) and /work/SRC/openSUSE:Factory/.mcelog.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "mcelog" Changes: -------- --- /work/SRC/openSUSE:Factory/mcelog/mcelog.changes 2013-02-23 16:38:09.000000000 +0100 +++ /work/SRC/openSUSE:Factory/.mcelog.new/mcelog.changes 2013-10-23 19:23:59.000000000 +0200 @@ -1,0 +2,10 @@ +Tue Oct 15 17:02:13 UTC 2013 - [email protected] + +- Updated to latest git HEAD: + commit c7bf28088f056925c04d4fd5768504c59bbf19c4 + Author: Robin Holt <[email protected]> + Date: Mon Sep 16 04:30:02 2013 -0500 + Because upstream does not use proper tags/revisions, I now + versioned this one mcelog-1.1 + +------------------------------------------------------------------- Old: ---- mcelog-1.0pre3.6e4e2a000124.tar.bz2 New: ---- mcelog-1.1.tar.bz2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ mcelog.spec ++++++ --- /var/tmp/diff_new_pack.Z8WZuI/_old 2013-10-23 19:24:00.000000000 +0200 +++ /var/tmp/diff_new_pack.Z8WZuI/_new 2013-10-23 19:24:00.000000000 +0200 @@ -16,11 +16,12 @@ # + Name: mcelog Summary: Log Machine Check Events License: GPL-2.0+ Group: System/Monitoring -Version: 1.0pre3.6e4e2a000124 +Version: 1.1 Release: 0 ExclusiveArch: ix86 x86_64 BuildRequires: libesmtp-devel ++++++ email.patch ++++++ --- /var/tmp/diff_new_pack.Z8WZuI/_old 2013-10-23 19:24:00.000000000 +0200 +++ /var/tmp/diff_new_pack.Z8WZuI/_new 2013-10-23 19:24:00.000000000 +0200 @@ -7,10 +7,10 @@ msg.c | 8 ++ 6 files changed, 343 insertions(+), 2 deletions(-) -Index: mcelog-1.0pre3.6e4e2a000124/Makefile +Index: mcelog-1.1/Makefile =================================================================== ---- mcelog-1.0pre3.6e4e2a000124.orig/Makefile -+++ mcelog-1.0pre3.6e4e2a000124/Makefile +--- mcelog-1.1.orig/Makefile ++++ mcelog-1.1/Makefile @@ -1,3 +1,4 @@ +CONFIG_EMAIL := 1 CFLAGS := -g -Os @@ -18,7 +18,7 @@ etcprefix := @@ -34,7 +35,8 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o co client.o cache.o sysfs.o yellow.o page.o rbtree.o \ - xeon75xx.o sandy-bridge.o + xeon75xx.o sandy-bridge.o 
ivy-bridge.o msr.o DISKDB_OBJ := diskdb.o dimm.o db.o -CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} +EMAIL_OBJ := email.o @@ -39,10 +39,10 @@ SRC := $(OBJ:.o=.c) mcelog: ${OBJ} -Index: mcelog-1.0pre3.6e4e2a000124/email.c +Index: mcelog-1.1/email.c =================================================================== --- /dev/null -+++ mcelog-1.0pre3.6e4e2a000124/email.c ++++ mcelog-1.1/email.c @@ -0,0 +1,199 @@ +#include <unistd.h> +#include <signal.h> @@ -111,7 +111,7 @@ + strncpy(c_recipient, email_env, MAX_STRING_LEN - 1); + return 1; + } -+ return 0; ++ return 0; +} + +/* Callback to prnt the recipient status */ @@ -243,10 +243,10 @@ + smtp_destroy_session (session); + return 0; +} -Index: mcelog-1.0pre3.6e4e2a000124/email.h +Index: mcelog-1.1/email.h =================================================================== --- /dev/null -+++ mcelog-1.0pre3.6e4e2a000124/email.h ++++ mcelog-1.1/email.h @@ -0,0 +1,34 @@ +#ifndef _MCELOG_EMAIL_H_ +#define _MCELOG_EMAIL_H_ @@ -282,10 +282,10 @@ +#endif + +#endif -Index: mcelog-1.0pre3.6e4e2a000124/mcelog.c +Index: mcelog-1.1/mcelog.c =================================================================== ---- mcelog-1.0pre3.6e4e2a000124.orig/mcelog.c -+++ mcelog-1.0pre3.6e4e2a000124/mcelog.c +--- mcelog-1.1.orig/mcelog.c ++++ mcelog-1.1/mcelog.c @@ -37,6 +37,7 @@ #include <assert.h> #include <signal.h> @@ -313,23 +313,23 @@ static char *inputfile; char *processor_flags; static int foreground; -@@ -914,6 +918,7 @@ void usage(void) - "--num-errors N Only process N errors (for testing)\n" +@@ -927,6 +931,7 @@ void usage(void) "--pidfile file Write pid of daemon into file\n" + "--no-imc-log Disable extended iMC logging\n" ); + email_usage(); diskdb_usage(); print_cputypes(); exit(1); -@@ -979,6 +984,7 @@ static struct option options[] = { - { "pidfile", 1, NULL, O_PIDFILE }, +@@ -994,6 +999,7 @@ static struct option options[] = { { "debug-numerrors", 0, NULL, O_DEBUG_NUMERRORS }, /* undocumented: for 
testing */ + { "no-imc-log", 0, NULL, O_NO_IMC_LOG }, DISKDB_OPTIONS + EMAIL_OPTIONS {} }; -@@ -1153,11 +1159,86 @@ static void drop_cred(void) +@@ -1171,11 +1177,86 @@ static void drop_cred(void) } } @@ -416,7 +416,7 @@ if (recordlen == 0) { Wprintf("no data in mce record\n"); -@@ -1177,12 +1258,16 @@ static void process(int fd, unsigned rec +@@ -1195,12 +1276,16 @@ static void process(int fd, unsigned rec finish = 1; if (!mce_filter(mce, recordlen)) continue; @@ -433,7 +433,7 @@ flushlog(); } -@@ -1293,6 +1378,8 @@ int main(int ac, char **av) +@@ -1311,6 +1396,8 @@ int main(int ac, char **av) exit(0); } else if (diskdb_cmd(opt, ac, av)) { exit(0); @@ -442,7 +442,7 @@ } else if (opt == 0) break; } -@@ -1301,6 +1388,10 @@ int main(int ac, char **av) +@@ -1339,6 +1426,10 @@ int main(int ac, char **av) logfn = av[optind++]; if (av[optind]) usage(); @@ -453,11 +453,11 @@ checkdmi(); general_setup(); -Index: mcelog-1.0pre3.6e4e2a000124/mcelog.h +Index: mcelog-1.1/mcelog.h =================================================================== ---- mcelog-1.0pre3.6e4e2a000124.orig/mcelog.h -+++ mcelog-1.0pre3.6e4e2a000124/mcelog.h -@@ -120,6 +120,7 @@ enum cputype { +--- mcelog-1.1.orig/mcelog.h ++++ mcelog-1.1/mcelog.h +@@ -123,6 +123,7 @@ enum cputype { enum option_ranges { O_COMMON = 500, O_DISKDB = 1000, @@ -465,10 +465,10 @@ }; enum syslog_opt { -Index: mcelog-1.0pre3.6e4e2a000124/msg.c +Index: mcelog-1.1/msg.c =================================================================== ---- mcelog-1.0pre3.6e4e2a000124.orig/msg.c -+++ mcelog-1.0pre3.6e4e2a000124/msg.c +--- mcelog-1.1.orig/msg.c ++++ mcelog-1.1/msg.c @@ -8,10 +8,13 @@ #include "mcelog.h" #include "msg.h" ++++++ mcelog-1.0pre3.6e4e2a000124.tar.bz2 -> mcelog-1.1.tar.bz2 ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/Makefile new/mcelog-1.1/Makefile --- old/mcelog-1.0pre3.6e4e2a000124/Makefile 2011-08-18 00:01:26.000000000 +0200 +++ 
new/mcelog-1.1/Makefile 2013-09-20 16:33:19.000000000 +0200 @@ -32,7 +32,7 @@ nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \ eventloop.o leaky-bucket.o memdb.o server.o trigger.o \ client.o cache.o sysfs.o yellow.o page.o rbtree.o \ - xeon75xx.o sandy-bridge.o + xeon75xx.o sandy-bridge.o ivy-bridge.o msr.o DISKDB_OBJ := diskdb.o dimm.o db.o CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} DOC := mce.pdf @@ -52,15 +52,15 @@ # dbquery intentionally not installed by default install: mcelog - mkdir -p ${etcprefix}/etc/mcelog ${prefix}/sbin ${prefix}/share/man/man8 - install -m 755 -p mcelog ${prefix}/sbin/mcelog - install -m 644 -p mcelog.8 ${prefix}/share/man/man8 - install -m 644 -p -b mcelog.conf ${etcprefix}/etc/mcelog/mcelog.conf + mkdir -p $(DESTDIR)${etcprefix}/etc/mcelog $(DESTDIR)${prefix}/sbin $(DESTDIR)${prefix}/share/man/man8 + install -m 755 -p mcelog $(DESTDIR)${prefix}/sbin/mcelog + install -m 644 -p mcelog.8 $(DESTDIR)${prefix}/share/man/man8 + install -m 644 -p -b mcelog.conf $(DESTDIR)${etcprefix}/etc/mcelog/mcelog.conf for i in ${TRIGGERS} ; do \ - install -m 755 -p -b triggers/$$i ${etcprefix}/etc/mcelog ; \ + install -m 755 -p -b triggers/$$i $(DESTDIR)${etcprefix}/etc/mcelog ; \ done ifdef DOCDIR - install -m 644 -p ${DOC} ${DOCDIR} + install -m 644 -p ${DOC} $(DESTDIR)${DOCDIR} else echo echo "Consider defining DOCDIR to install additional documentation" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/bitfield.h new/mcelog-1.1/bitfield.h --- old/mcelog-1.0pre3.6e4e2a000124/bitfield.h 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/bitfield.h 2013-09-20 16:33:19.000000000 +0200 @@ -30,3 +30,8 @@ #define MASK(x) ((1ULL << (1 + (x))) - 1) #define EXTRACT(v, a, b) (((v) >> (a)) & MASK((b)-(a))) + +static inline int test_prefix(int nr, __u32 value) +{ + return ((value >> nr) == 1); +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' 
'--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/cache.c new/mcelog-1.1/cache.c --- old/mcelog-1.0pre3.6e4e2a000124/cache.c 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/cache.c 2013-09-20 16:33:19.000000000 +0200 @@ -61,13 +61,15 @@ static unsigned cpumap_len(char *s) { - unsigned len = 0; - while (*s) { + unsigned len = 0, width = 0; + do { if (isxdigit(*s)) - len++; - s++; - } - len = round_up(len * 4, BITS_PER_INT) / 8; + width++; + else { + len += round_up(width * 4, BITS_PER_INT) / 8; + width = 0; + } + } while (*s++); return len; } @@ -118,22 +120,23 @@ int numindex; asprintf(&fn, "%s/%s/cache", PREFIX, de->d_name); - stat(fn, &st); - numindex = st.st_nlink - 2; - if (numindex < 0) - numindex = MIN_INDEX; - if (cachelen <= cpu) - more_cpus(cpu); - caches[cpu] = xalloc(sizeof(struct cache) * - (numindex+1)); - for (i = 0; i < numindex; i++) { - char *cfn; - struct cache *c = caches[cpu] + i; - asprintf(&cfn, "%s/index%d", fn, i); - c->type = read_field_map(cfn, "type", type_map); - c->level = read_field_num(cfn, "level"); - read_cpu_map(c, cfn); - free(cfn); + if (!stat(fn, &st)) { + numindex = st.st_nlink - 2; + if (numindex < 0) + numindex = MIN_INDEX; + if (cachelen <= cpu) + more_cpus(cpu); + caches[cpu] = xalloc(sizeof(struct cache) * + (numindex+1)); + for (i = 0; i < numindex; i++) { + char *cfn; + struct cache *c = caches[cpu] + i; + asprintf(&cfn, "%s/index%d", fn, i); + c->type = read_field_map(cfn, "type", type_map); + c->level = read_field_num(cfn, "level"); + read_cpu_map(c, cfn); + free(cfn); + } } free(fn); } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/dmi.c new/mcelog-1.1/dmi.c --- old/mcelog-1.0pre3.6e4e2a000124/dmi.c 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/dmi.c 2013-09-20 16:33:19.000000000 +0200 @@ -48,7 +48,7 @@ char fmt[5]; char str2[5]; /* _DMI_ */ char csum2; - short length; + unsigned short length; unsigned 
table; unsigned short numentries; char bcdrev; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/intel.c new/mcelog-1.1/intel.c --- old/mcelog-1.0pre3.6e4e2a000124/intel.c 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/intel.c 2013-09-20 16:33:19.000000000 +0200 @@ -22,6 +22,8 @@ #include "nehalem.h" #include "memdb.h" #include "page.h" +#include "sandy-bridge.h" +#include "ivy-bridge.h" #include "xeon75xx.h" int memory_error_support; @@ -29,7 +31,9 @@ void intel_cpu_init(enum cputype cpu) { if (cpu == CPU_NEHALEM || cpu == CPU_XEON75XX || cpu == CPU_INTEL || - cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP) + cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP || + cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX || + cpu == CPU_HASWELL) memory_error_support = 1; } @@ -50,7 +54,8 @@ return CPU_CORE2; else if (model == 0x1d) return CPU_DUNNINGTON; - else if (model == 0x1a || model == 0x2c || model == 0x1e) + else if (model == 0x1a || model == 0x2c || model == 0x1e || + model == 0x25) return CPU_NEHALEM; else if (model == 0x2e || model == 0x2f) return CPU_XEON75XX; @@ -58,14 +63,20 @@ return CPU_SANDY_BRIDGE; else if (model == 0x2d) return CPU_SANDY_BRIDGE_EP; + else if (model == 0x3a) + return CPU_IVY_BRIDGE; + else if (model == 0x3e) + return CPU_IVY_BRIDGE_EPEX; + else if (model == 0x3c || model == 0x45 || model == 0x46) + return CPU_HASWELL; if (model > 0x1a) { - Eprintf("Unsupported new Family 6 Model %x CPU: only decoding architectural errors\n", + Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n", model); return CPU_INTEL; } } if (family > 6) { - Eprintf("Unsupported new Family %u Model %x CPU: only decoding architectural errors\n", + Eprintf("Family %u Model %x CPU: only decoding architectural errors\n", family, model); return CPU_INTEL; } @@ -97,6 +108,12 @@ case CPU_XEON75XX: xeon75xx_memory_error(m, recordlen, channel, dimm); break; + case 
CPU_SANDY_BRIDGE_EP: + sandy_bridge_ep_memerr_misc(m, channel, dimm); + break; + case CPU_IVY_BRIDGE_EPEX: + ivy_bridge_ep_memerr_misc(m, channel, dimm); + break; default: break; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/intel.h new/mcelog-1.1/intel.h --- old/mcelog-1.0pre3.6e4e2a000124/intel.h 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/intel.h 2013-09-20 16:33:19.000000000 +0200 @@ -15,5 +15,8 @@ case CPU_INTEL: \ case CPU_XEON75XX: \ case CPU_SANDY_BRIDGE_EP: \ - case CPU_SANDY_BRIDGE + case CPU_SANDY_BRIDGE: \ + case CPU_IVY_BRIDGE: \ + case CPU_IVY_BRIDGE_EPEX: \ + case CPU_HASWELL diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/ivy-bridge.c new/mcelog-1.1/ivy-bridge.c --- old/mcelog-1.0pre3.6e4e2a000124/ivy-bridge.c 1970-01-01 01:00:00.000000000 +0100 +++ new/mcelog-1.1/ivy-bridge.c 2013-09-20 16:33:19.000000000 +0200 @@ -0,0 +1,159 @@ +/* Copyright (C) 2013 Intel Corporation + Decode Intel Ivy Bridge specific machine check errors. + + mcelog is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; version + 2. + + mcelog is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should find a copy of v2 of the GNU General Public License somewhere + on your Linux system; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + Author: Tony Luck +*/ + +#include "mcelog.h" +#include "bitfield.h" +#include "ivy-bridge.h" +#include "memdb.h" + +/* See IA32 SDM Vol3B Table 16-17 */ + +static char *pcu_1[] = { + [0] = "No error", + [1] = "Non_IMem_Sel", + [2] = "I_Parity_Error", + [3] = "Bad_OpCode", + [4] = "I_Stack_Underflow", + [5] = "I_Stack_Overflow", + [6] = "D_Stack_Underflow", + [7] = "D_Stack_Overflow", + [8] = "Non-DMem_Sel", + [9] = "D_Parity_Error" +}; + +static char *pcu_2[] = { + [0x00] = "No Error", + [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT", + [0x0E] = "MC_MC_CPD_UNCPD_ST_TIMEOUT", + [0x0F] = "MC_PKGS_SAFE_WP_TIMEOUT", + [0x43] = "MC_PECI_MAILBOX_QUIESCE_TIMEOUT", + [0x44] = "MC_CRITICAL_VR_FAILED", + [0x45] = "MC_ICC_MAX-NOTSUPPORTED", + [0x5C] = "MC_MORE_THAN_ONE_LT_AGENT", + [0x60] = "MC_INVALID_PKGS_REQ_PCH", + [0x61] = "MC_INVALID_PKGS_REQ_QPI", + [0x62] = "MC_INVALID_PKGS_RES_QPI", + [0x63] = "MC_INVALID_PKGC_RES_PCH", + [0x64] = "MC_INVALID_PKG_STATE_CONFIG", + [0x70] = "MC_WATCHDG_TIMEOUT_PKGC_SLAVE", + [0x71] = "MC_WATCHDG_TIMEOUT_PKGC_MASTER", + [0x72] = "MC_WATCHDG_TIMEOUT_PKGS_MASTER", + [0x7A] = "MC_HA_FAILSTS_CHANGE_DETECTED", + [0x7B] = "MC_PCIE_R2PCIE-RW_BLOCK_ACK_TIMEOUT", + [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT", +}; + +static struct field pcu_mc4[] = { + FIELD(16, pcu_1), + FIELD(24, pcu_2), + {} +}; + +/* See IA32 SDM Vol3B Table 16-18 */ + +static char *memctrl_1[] = { + [0x001] = "Address parity error", + [0x002] = "HA Wrt buffer Data parity error", + [0x004] = "HA Wrt byte enable parity error", + [0x008] = "Corrected patrol scrub error", + [0x010] = "Uncorrected patrol scrub error", + [0x020] = "Corrected spare error", + [0x040] = "Uncorrected spare error", + [0x080] = "Corrected memory read error", + [0x100] = "iMC, WDB, parity errors", +}; 
+ +static struct field memctrl_mc9[] = { + FIELD(16, memctrl_1), + {} +}; + +void ivb_decode_model(int cputype, int bank, u64 status, u64 misc) +{ + switch (bank) { + case 4: + Wprintf("PCU: "); + decode_bitfield(status, pcu_mc4); + Wprintf("\n"); + break; + case 5: + if (cputype == CPU_IVY_BRIDGE_EPEX) { + /* MCACOD already decoded */ + Wprintf("QPI\n"); + } + break; + case 9: case 10: case 11: case 12: + case 13: case 14: case 15: case 16: + Wprintf("MemCtrl: "); + decode_bitfield(status, memctrl_mc9); + Wprintf("\n"); + break; + } +} + +/* + * Ivy Bridge EP and EX processors (family 6, model 62) support additional + * logging for corrected errors in the integrated memory controller (IMC) + * banks. The mode is off by default, but can be enabled by setting the + * "MemError Log Enable" * bit in MSR_ERROR_CONTROL (MSR 0x17f). + * The SDM leaves it as an exercise for the reader to convert the + * faling rank to a DIMM slot. + */ +static int failrank2dimm(unsigned failrank, int socket, int channel) +{ + switch (failrank) { + case 0: case 1: case 2: case 3: + return 0; + case 4: case 5: + return 1; + case 6: case 7: + if (get_memdimm(socket, channel, 2, 0)) + return 2; + else + return 1; + } + return -1; +} + +void ivy_bridge_ep_memerr_misc(struct mce *m, int *channel, int *dimm) +{ + u64 status = m->status; + unsigned failrank, chan; + + /* Ignore unless this is an corrected extended error from an iMC bank */ + if (!imc_log || m->bank < 9 || m->bank > 16 || (status & MCI_STATUS_UC) || + !test_prefix(7, status & 0xefff)) + return; + + chan = EXTRACT(status, 0, 3); + if (chan == 0xf) + return; + + if (EXTRACT(m->misc, 62, 62)) { + failrank = EXTRACT(m->misc, 46, 50); + dimm[0] = failrank2dimm(failrank, m->socketid, chan); + channel[0] = chan; + } + if (EXTRACT(m->misc, 63, 63)) { + failrank = EXTRACT(m->misc, 51, 55); + dimm[1] = failrank2dimm(failrank, m->socketid, chan); + channel[1] = chan; + } +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/ivy-bridge.h new/mcelog-1.1/ivy-bridge.h --- old/mcelog-1.0pre3.6e4e2a000124/ivy-bridge.h 1970-01-01 01:00:00.000000000 +0100 +++ new/mcelog-1.1/ivy-bridge.h 2013-09-20 16:33:19.000000000 +0200 @@ -0,0 +1,2 @@ +void ivb_decode_model(int cputype, int bank, u64 status, u64 misc); +void ivy_bridge_ep_memerr_misc(struct mce *m, int *channel, int *dimm); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/leaky-bucket.c new/mcelog-1.1/leaky-bucket.c --- old/mcelog-1.0pre3.6e4e2a000124/leaky-bucket.c 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/leaky-bucket.c 2013-09-20 16:33:19.000000000 +0200 @@ -29,16 +29,15 @@ time_t now) { long diff; + unsigned age; + diff = now - b->tstamp; - if (diff >= c->agetime) { - unsigned age = (diff / (double)c->agetime) * c->capacity; - b->tstamp = now; - if (age > b->count) - b->count = 0; - else - b->count -= age; - b->excess = 0; - } + age = (diff / (double)c->agetime) * c->capacity; + + if (age > b->count) + b->count = 0; + else + b->count -= age; } /* Account increase in leaky bucket. Return 1 if bucket overflowed. 
*/ @@ -48,11 +47,9 @@ if (c->capacity == 0) return 0; bucket_age(c, b, t); - b->count += inc; - if (b->count >= c->capacity) { - b->excess += b->count; - /* should disable overflow completely in the same time unit */ - b->count = 0; + if (b->count < c->capacity) { + b->count += inc; + b->tstamp = t; return 1; } return 0; @@ -69,7 +66,7 @@ unsigned corr = 1; switch (unit) { case 'd': corr *= 24; - case 'h': corr *= 3600; + case 'h': corr *= 60; case 'm': corr *= 60; case 0: break; default: return -1; @@ -151,3 +148,70 @@ b->excess = 0; b->tstamp = bucket_time(); } + + +#ifdef TEST_LEAKY_BUCKET +/* Stolen from the cpp documentation */ +#define xstr(_s) str(_s) +#define str(_s) #_s + +#define THRESHOLD_EVENTS_PER_PERIOD 100 +#define EVENTS_PER_LOGGED_EVENT 10 +#define SECONDS_PER_EVENT 86 +/* Needs to be SECONDS_PER_EVENT * EVENTS_PER_LOGGED_EVENT * THRESHOLD_EVENTS_PER_PERIOD */ +#define THRESHOLD_PERIOD 86000 +#if THRESHOLD_PERIOD != (SECONDS_PER_EVENT * EVENTS_PER_LOGGED_EVENT * THRESHOLD_EVENTS_PER_PERIOD) +# error THRESHOLD_PERIOD is Wrong! 
+#endif +#define RATE_STRING xstr(THRESHOLD_EVENTS_PER_PERIOD) " / " xstr(THRESHOLD_PERIOD) + +#define EVENTS_PER_PERIOD_IN_TEST (THRESHOLD_EVENTS_PER_PERIOD * EVENTS_PER_LOGGED_EVENT) + +#define PERIODS_TO_TEST 3 +#define TOTAL_SECONDS_FOR_TEST (PERIODS_TO_TEST * THRESHOLD_PERIOD) +#define TOTAL_EVENTS (PERIODS_TO_TEST * EVENTS_PER_PERIOD_IN_TEST) + +int main(int argc, char **argv) +{ + struct bucket_conf c; + struct leaky_bucket b; + time_t start_time; + time_t event_time; + int ret; + int i; + +#ifdef TEST_LEAKY_BUCKET_DEBUG + printf("Testing with a rate of " RATE_STRING "\n"); +#endif + ret = bucket_conf_init(&c, RATE_STRING); + if (ret) + return ret; + + bucket_init(&b); + start_time = b.tstamp; + + for (i = 1; i <= TOTAL_EVENTS; i++) { + event_time = start_time + i * SECONDS_PER_EVENT; + ret = __bucket_account(&c, &b, 1, event_time); + +#ifdef TEST_LEAKY_BUCKET_DEBUG + if (ret) + printf("Logging entry %d at %ld %ld\n", i, event_time - start_time, b.tstamp); +#else + if (i < THRESHOLD_EVENTS_PER_PERIOD) { + if (!ret){ + fprintf(stderr, "Did not log initial events - FAIL.\n"); + return -1; + } + } else { + if (!(i % EVENTS_PER_LOGGED_EVENT) && !ret) { + fprintf(stderr, "Did not log initial events - FAIL.\n"); + return -1; + } + } +#endif + } + + return 0; +} +#endif diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/mcelog.8 new/mcelog-1.1/mcelog.8 --- old/mcelog-1.0pre3.6e4e2a000124/mcelog.8 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/mcelog.8 2013-09-20 16:33:19.000000000 +0200 @@ -214,6 +214,13 @@ .I file. Only valid in daemon mode. +Mcelog will enable extended error reporting from the memory +controller on processors that support it unless you tell it +not to with the +.B \-\-no-imc-log +option. You might need this option when decoding old logs +from a system where this mode was not enabled. + .\".B \-\-database filename .\"specifies the memory module error database file. 
Default is .\"/var/lib/memory-errors. It is only used together with DMI decoding. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/mcelog.c new/mcelog-1.1/mcelog.c --- old/mcelog-1.0pre3.6e4e2a000124/mcelog.c 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/mcelog.c 2013-09-20 16:33:19.000000000 +0200 @@ -82,8 +82,9 @@ static char *pidfile = pidfile_default; static char *logfile; static int debug_numerrors; +int imc_log = -1; -static void check_cpu(void); +static int is_cpu_supported(void); static void disclaimer(void) @@ -224,6 +225,9 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series", [CPU_SANDY_BRIDGE] = "Sandy Bridge", /* Fill in better name */ [CPU_SANDY_BRIDGE_EP] = "Sandy Bridge EP", /* Fill in better name */ + [CPU_IVY_BRIDGE] = "Ivy Bridge", /* Fill in better name */ + [CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */ + [CPU_HASWELL] = "Haswell", /* Fill in better name */ }; static struct config_choice cpu_choices[] = { @@ -256,6 +260,10 @@ { "xeon7100", CPU_P4 }, { "sandybridge", CPU_SANDY_BRIDGE }, /* Fill in better name */ { "sandybridge-ep", CPU_SANDY_BRIDGE_EP }, /* Fill in better name */ + { "ivybridge", CPU_IVY_BRIDGE }, /* Fill in better name */ + { "ivybridge-ep", CPU_IVY_BRIDGE_EPEX }, /* Fill in better name */ + { "ivybridge-ex", CPU_IVY_BRIDGE_EPEX }, /* Fill in better name */ + { "haswell", CPU_HASWELL }, /* Fill in better name */ {} }; @@ -346,7 +354,7 @@ warned = 1; } } else if (cputype == CPU_GENERIC && !cpu_forced) { - check_cpu(); + is_cpu_supported(); } } @@ -416,7 +424,8 @@ fam, mod); } - resolveaddr(m->addr); + if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX) + resolveaddr(m->addr); if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) { diskdb_resolve_addr(m->addr); } @@ -448,7 +457,7 @@ Wprintf("\n"); } -void check_cpu(void) +int is_cpu_supported(void) { enum { VENDOR = 1, @@ -462,7 +471,7 @@ static int checked; 
if (checked) - return; + return 1; checked = 1; f = fopen("/proc/cpuinfo","r"); @@ -498,12 +507,13 @@ } if (seen == ALL) { - if (cpu_forced) - ; - else if (!strcmp(vendor,"AuthenticAMD") && - (family == 15 || family == 16 || family == 17)) - cputype = CPU_K8; - else if (!strcmp(vendor,"GenuineIntel")) + if (!strcmp(vendor,"AuthenticAMD")) { + if (family == 15) + cputype = CPU_K8; + if (family >= 15) + SYSERRprintf("AMD Processor family %d: Please load edac_mce_amd module.\n", family); + return 0; + } else if (!strcmp(vendor,"GenuineIntel")) cputype = select_intel_cputype(family, model); /* Add checks for other CPUs here */ } else { @@ -513,6 +523,8 @@ free(line); } else Eprintf("warning: Cannot open /proc/cpuinfo\n"); + + return 1; } static char *skipspace(char *s) @@ -913,6 +925,7 @@ "--foreground Keep in foreground (for debugging)\n" "--num-errors N Only process N errors (for testing)\n" "--pidfile file Write pid of daemon into file\n" +"--no-imc-log Disable extended iMC logging\n" ); diskdb_usage(); print_cputypes(); @@ -946,6 +959,7 @@ O_NUMERRORS, O_PIDFILE, O_DEBUG_NUMERRORS, + O_NO_IMC_LOG, }; static struct option options[] = { @@ -978,6 +992,7 @@ { "num-errors", 1, NULL, O_NUMERRORS }, { "pidfile", 1, NULL, O_PIDFILE }, { "debug-numerrors", 0, NULL, O_DEBUG_NUMERRORS }, /* undocumented: for testing */ + { "no-imc-log", 0, NULL, O_NO_IMC_LOG }, DISKDB_OPTIONS {} }; @@ -1082,6 +1097,9 @@ case O_DEBUG_NUMERRORS: debug_numerrors = 1; break; + case O_NO_IMC_LOG: + imc_log = 0; + break; case 0: break; default: @@ -1296,6 +1314,26 @@ } else if (opt == 0) break; } + + /* before doing anything else let's see if the CPUs are supported */ + if (!cpu_forced && !is_cpu_supported()) { + fprintf(stderr, "CPU is unsupported\n"); + exit(1); + } + + /* If the user didn't tell us not to use iMC logging, check if CPU supports it */ + if (imc_log == -1) { + switch (cputype) { + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + imc_log = 1; + break; + default: + imc_log = 
0; + break; + } + } + modifier_finish(); if (av[optind]) logfn = av[optind++]; @@ -1319,12 +1357,13 @@ d.buf = xalloc(d.recordlen * d.loglen); if (daemon_mode) { - check_cpu(); prefill_memdb(); if (!do_dmi) closedmi(); server_setup(); page_setup(); + if (imc_log) + set_imc_log(cputype); drop_cred(); register_pollcb(fd, POLLIN, process_mcefd, &d); if (!foreground && daemon(0, need_stdout()) < 0) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/mcelog.h new/mcelog-1.1/mcelog.h --- old/mcelog-1.0pre3.6e4e2a000124/mcelog.h 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/mcelog.h 2013-09-20 16:33:19.000000000 +0200 @@ -115,6 +115,9 @@ CPU_XEON75XX, CPU_SANDY_BRIDGE, CPU_SANDY_BRIDGE_EP, + CPU_IVY_BRIDGE, + CPU_IVY_BRIDGE_EPEX, + CPU_HASWELL, }; enum option_ranges { @@ -139,3 +142,5 @@ extern int syslog_level; extern enum cputype cputype; extern int filter_memory_errors; +extern int imc_log; +extern void set_imc_log(int cputype); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/memdb.c new/mcelog-1.1/memdb.c --- old/mcelog-1.0pre3.6e4e2a000124/memdb.c 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/memdb.c 2013-09-20 16:33:19.000000000 +0200 @@ -77,18 +77,19 @@ } /* Search DIMM in hash table */ -struct memdimm *get_memdimm(int socketid, int channel, int dimm) +struct memdimm *get_memdimm(int socketid, int channel, int dimm, int insert) { struct memdimm *md; - unsigned h = dimmhash(socketid, dimm, channel); + unsigned h; + h = dimmhash(socketid, dimm, channel); for (md = md_dimms[h]; md; md = md->next) { if (md->socketid == socketid && md->channel == channel && md->dimm == dimm) break; } - if (md) + if (md || !insert) return md; md = xalloc(sizeof(struct memdimm)); @@ -239,12 +240,12 @@ } if (memdb_enabled && (ch != -1 || dimm != -1)) { - md = get_memdimm(m->socketid, ch, dimm); + md = get_memdimm(m->socketid, ch, 
dimm, 1); account_memdb(&dimms, md, m); } if (sockdb_enabled) { - md = get_memdimm(m->socketid, -1, -1); + md = get_memdimm(m->socketid, -1, -1, 1); account_over(&sockets, md, m, corr_err_cnt); account_memdb(&sockets, md, m); } @@ -373,6 +374,9 @@ if (sscanf(bl + strcspn(bl, "_"), "_Node%u_Channel%u_Dimm%u", socketid, channel, dimm) == 3) return 1; + if (sscanf(bl, "NODE %u CHANNEL %u DIMM %u", socketid, + channel, dimm) == 3) + return 1; /* Add more DMI formats here */ return 0; } @@ -407,7 +411,7 @@ continue; } - md = get_memdimm(socketid, channel, dimm); + md = get_memdimm(socketid, channel, dimm, 1); if (md->memdev) { /* dups -- likely parse error */ missed++; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/memdb.h new/mcelog-1.1/memdb.h --- old/mcelog-1.0pre3.6e4e2a000124/memdb.h 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/memdb.h 2013-09-20 16:33:19.000000000 +0200 @@ -21,4 +21,4 @@ struct memdimm; void memdb_trigger(char *msg, struct memdimm *md, time_t t, struct err_type *et, struct bucket_conf *bc); -struct memdimm *get_memdimm(int socketid, int channel, int dimm); +struct memdimm *get_memdimm(int socketid, int channel, int dimm, int insert); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/msr.c new/mcelog-1.1/msr.c --- old/mcelog-1.0pre3.6e4e2a000124/msr.c 1970-01-01 01:00:00.000000000 +0100 +++ new/mcelog-1.1/msr.c 2013-09-20 16:33:19.000000000 +0200 @@ -0,0 +1,61 @@ +#include "mcelog.h" +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> + +static void domsr(int cpu, int msr, int bit) +{ + char fpath[32]; + unsigned long long data; + int fd; + + sprintf(fpath, "/dev/cpu/%d/msr", cpu); + fd = open(fpath, O_RDWR); + if (fd == -1) { + switch (errno) { + case ENOENT: + SYSERRprintf("Warning: cpu %d offline?, imc_log not set\n", cpu); + return; + default: 
+ SYSERRprintf("Cannot open %s to set imc_log\n", fpath); + exit(1); + } + } + if (pread(fd, &data, sizeof data, msr) != sizeof data) { + SYSERRprintf("Cannot read MSR_ERROR_CONTROL from %s\n", fpath); + exit(1); + } + data |= bit; + if (pwrite(fd, &data, sizeof data, msr) != sizeof data) { + SYSERRprintf("Cannot write MSR_ERROR_CONTROL to %s\n", fpath); + exit(1); + } + if (pread(fd, &data, sizeof data, msr) != sizeof data) { + SYSERRprintf("Cannot re-read MSR_ERROR_CONTROL from %s\n", fpath); + exit(1); + } + if ((data & bit) == 0) { + SYSERRprintf("Failed to set imc_log on cpu %d\n", cpu); + exit(1); + } + close(fd); +} + +void set_imc_log(int cputype) +{ + int cpu, ncpus = sysconf(_SC_NPROCESSORS_CONF); + int msr, bit; + + switch (cputype) { + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; + } + + for (cpu = 0; cpu < ncpus; cpu++) + domsr(cpu, msr, bit); +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/p4.c new/mcelog-1.1/p4.c --- old/mcelog-1.0pre3.6e4e2a000124/p4.c 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/p4.c 2013-09-20 16:33:19.000000000 +0200 @@ -30,15 +30,12 @@ #include "tulsa.h" #include "intel.h" #include "yellow.h" +#include "bitfield.h" #include "sandy-bridge.h" +#include "ivy-bridge.h" /* decode mce for P4/Xeon and Core2 family */ -static inline int test_prefix(int nr, __u32 value) -{ - return ((value >> nr) == 1); -} - static char* get_TT_str(__u8 t) { static char* TT[] = {"Instruction", "Data", "Generic", "Unknown"}; @@ -149,6 +146,7 @@ [2] = "Microcode ROM parity error", [3] = "External error", [4] = "FRC error", + [5] = "Internal parity error", }; if (mca & (1UL << 12)) { @@ -356,8 +354,28 @@ break; case CPU_SANDY_BRIDGE: case CPU_SANDY_BRIDGE_EP: - snb_decode_model(cputype, log->bank, log->status, size); + snb_decode_model(cputype, log->bank, log->status, 
log->misc); break; + case CPU_IVY_BRIDGE_EPEX: + ivb_decode_model(cputype, log->bank, log->status, log->misc); + break; + } + + /* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values + * and MISCV set. MISC register points to root port that reported the error + * need to cross check with AER logs for more details. + * See: http://www.intel.com/content/www/us/en/architecture-and-technology/enhanced-mca-logging-xeon-paper.html + */ + if ((log->status & MCI_STATUS_MISCV) && + (log->status & 0xefff) == 0x0e0b) { + int seg, bus, dev, fn; + + seg = EXTRACT(log->misc, 32, 39); + bus = EXTRACT(log->misc, 24, 31); + dev = EXTRACT(log->misc, 19, 23); + fn = EXTRACT(log->misc, 16, 18); + Wprintf("IO MCA reported by root port %x:%02x:%02x.%x\n", + seg, bus, dev, fn); } } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/page.c new/mcelog-1.1/page.c --- old/mcelog-1.0pre3.6e4e2a000124/page.c 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/page.c 2013-09-20 16:33:19.000000000 +0200 @@ -171,12 +171,30 @@ u64 addr = m->addr; struct mempage *mp; time_t t; + unsigned cpu = m->extcpu ? m->extcpu : m->cpu; if (offline == OFFLINE_OFF) return; if (!(m->status & MCI_STATUS_ADDRV) || (m->status & MCI_STATUS_UC)) return; + switch (cputype) { + case CPU_SANDY_BRIDGE_EP: + /* + * On SNB-EP platform we see corrected errors reported with + * address in Bank 5 from hardware (depending on BIOS setting), + * in the meanwhile, a duplicate record constructed from + * information found by "firmware first" APEI code. Ignore the + * duplicate information so that we don't double count errors. + * + * NOTE: the record from APEI fake this error from CPU 0 BANK 1. 
+ */ + if (m->bank == 1 && cpu == 0) + return; + default: + break; + } + t = m->time; addr &= ~((u64)PAGE_SIZE - 1); mp = mempage_lookup(addr); @@ -195,7 +213,7 @@ return; /* Only do triggers and messages for online pages */ thresh = bucket_output(&page_trigger_conf, &mp->ce.bucket); - md = get_memdimm(m->socketid, channel, dimm); + md = get_memdimm(m->socketid, channel, dimm, 1); asprintf(&msg, "Corrected memory errors on page %llx exceed threshold %s", addr, thresh); free(thresh); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/sandy-bridge.c new/mcelog-1.1/sandy-bridge.c --- old/mcelog-1.0pre3.6e4e2a000124/sandy-bridge.c 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/sandy-bridge.c 2013-09-20 16:33:19.000000000 +0200 @@ -21,8 +21,9 @@ #include "mcelog.h" #include "bitfield.h" #include "sandy-bridge.h" +#include "memdb.h" -/* See IA32 SDM Vol3B Appendix E.4 ff */ +/* See IA32 SDM Vol3B Table 16.4.1 */ static char *pcu_1[] = { [0] = "No error", @@ -39,17 +40,21 @@ static char *pcu_2[] = { [0x00] = "No Error", - [0x20] = "MC_RCLK_PLL_LOCK_TIMEOUT", - [0x21] = "MC_PCIE_PLL_LOCK_TIMEOT", - [0x22] = "MC_BOOT_VID_SET_TIMEOUT", - [0x23] = "MC_BOOT_FREQUENCY_SET_TIMEOUT", - [0x24] = "MC_START_IA_CORES_TIMEOUT", - [0x26] = "MC_PCIE_RCOMP_TIMEOUT", - [0x27] = "MC_PMA_DNS_COMMAND_TIMEOUT", - [0x28] = "MC_MESSAGE_CHANNEL_TIMEOUT", - [0x29] = "MC_GVFSM_BGF_PROGRAM_TIMEOUT", - [0x2A] = "MC_MC_PLL_LOCK_TIMEOUT", - [0x2B] = "MC_MS_BGF_PROGRAM_TIMEOUT", + [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT", + [0x0E] = "MC_MC_CPD_UNCPD_ST_TIMEOUT", + [0x0F] = "MC_PKGS_SAFE_WP_TIMEOUT", + [0x43] = "MC_PECI_MAILBOX_QUIESCE_TIMEOUT", + [0x5C] = "MC_MORE_THAN_ONE_LT_AGENT", + [0x60] = "MC_INVALID_PKGS_REQ_PCH", + [0x61] = "MC_INVALID_PKGS_REQ_QPI", + [0x62] = "MC_INVALID_PKGS_RES_QPI", + [0x63] = "MC_INVALID_PKGC_RES_PCH", + [0x64] = "MC_INVALID_PKG_STATE_CONFIG", + [0x70] = "MC_WATCHDG_TIMEOUT_PKGC_SLAVE", + [0x71] = 
"MC_WATCHDG_TIMEOUT_PKGC_MASTER", + [0x72] = "MC_WATCHDG_TIMEOUT_PKGS_MASTER", + [0x7A] = "MC_HA_FAILSTS_CHANGE_DETECTED", + [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT", }; static struct field pcu_mc4[] = { @@ -58,6 +63,21 @@ {} }; +static char *memctrl_1[] = { + [0x001] = "Address parity error", + [0x002] = "HA Wrt buffer Data parity error", + [0x004] = "HA Wrt byte enable parity error", + [0x008] = "Corrected patrol scrub error", + [0x010] = "Uncorrected patrol scrub error", + [0x020] = "Corrected spare error", + [0x040] = "Uncorrected spare error", +}; + +static struct field memctrl_mc8[] = { + FIELD(16, memctrl_1), + {} +}; + void snb_decode_model(int cputype, int bank, u64 status, u64 misc) { switch (bank) { @@ -73,5 +93,71 @@ Wprintf("QPI\n"); } break; + case 8: + case 9: + case 10: + case 11: + Wprintf("MemCtrl: "); + decode_bitfield(status, memctrl_mc8); + Wprintf("\n"); + break; + } +} + +/* + * Sandy Bridge EP and EP4S processors (family 6, model 45) support additional + * logging for corrected errors in the integrated memory controller (IMC) + * banks. The mode is off by default, but can be enabled by setting the + * "MemError Log Enable" bit in MSR_ERROR_CONTROL (MSR 0x17f). + * The documentation in the August 2012 edition of Intel's Software developer + * manual has some minor errors because the wrong version of table 16-16 + * "Intel IMC MC Error Codes for IA32_MCi_MISC (i= 8, 11)" was included. + * Corrections are: + * Bit 62 is the "VALID" bit for the "first-device" bits in MISC and STATUS + * Bit 63 is the "VALID" bit for the "second-device" bits in MISC + * Bits 58:56 and 61:59 should be marked as "reserved". + * There should also be a footnote explaining how the "failing rank" fields + * can be converted to a DIMM number within a channel for systems with either + * two or three DIMMs per channel.
+ */ +static int failrank2dimm(unsigned failrank, int socket, int channel) +{ + switch (failrank) { + case 0: case 1: case 2: case 3: + return 0; + case 4: case 5: + return 1; + case 6: case 7: + if (get_memdimm(socket, channel, 2, 0)) + return 2; + else + return 1; + } + return -1; +} + +void sandy_bridge_ep_memerr_misc(struct mce *m, int *channel, int *dimm) +{ + u64 status = m->status; + unsigned failrank, chan; + + /* Ignore unless this is a corrected extended error from an iMC bank */ + if (!imc_log || m->bank < 8 || m->bank > 11 || (status & MCI_STATUS_UC) || + !test_prefix(7, status & 0xefff)) + return; + + chan = EXTRACT(status, 0, 3); + if (chan == 0xf) + return; + + if (EXTRACT(m->misc, 62, 62)) { + failrank = EXTRACT(m->misc, 46, 50); + dimm[0] = failrank2dimm(failrank, m->socketid, chan); + channel[0] = chan; + } + if (EXTRACT(m->misc, 63, 63)) { + failrank = EXTRACT(m->misc, 51, 55); + dimm[1] = failrank2dimm(failrank, m->socketid, chan); + channel[1] = chan; } } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/sandy-bridge.h new/mcelog-1.1/sandy-bridge.h --- old/mcelog-1.0pre3.6e4e2a000124/sandy-bridge.h 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/sandy-bridge.h 2013-09-20 16:33:19.000000000 +0200 @@ -1 +1,2 @@ void snb_decode_model(int cputype, int bank, u64 status, u64 misc); +void sandy_bridge_ep_memerr_misc(struct mce *m, int *channel, int *dimm); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0pre3.6e4e2a000124/tests/Makefile new/mcelog-1.1/tests/Makefile --- old/mcelog-1.0pre3.6e4e2a000124/tests/Makefile 2011-08-18 00:01:26.000000000 +0200 +++ new/mcelog-1.1/tests/Makefile 2013-09-20 16:33:19.000000000 +0200 @@ -11,4 +11,4 @@ clean: rm -f */*log - rm -f */results + rm -f */results* -- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
