Hello community, here is the log from the commit of package mcelog for openSUSE:Factory checked in at 2015-07-05 17:57:37 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/mcelog (Old) and /work/SRC/openSUSE:Factory/.mcelog.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "mcelog" Changes: -------- --- /work/SRC/openSUSE:Factory/mcelog/mcelog.changes 2015-01-25 21:14:02.000000000 +0100 +++ /work/SRC/openSUSE:Factory/.mcelog.new/mcelog.changes 2015-07-05 17:57:38.000000000 +0200 @@ -1,0 +2,12 @@ +Mon Jun 15 16:18:55 UTC 2015 - [email protected] + +- Update to latest v120 git tag and name the version 1.20: + New supported CPUs: + - Add model number for Broadwell-DE + - Added Knights Landing (Xeon Phi) + - Add all current Atom cpuids + - Support Broadwell-U + - New manpages: mcelog.conf.5 and mcelog.triggers.5 + And quite some undocumented bugfixes, see git log for details + +------------------------------------------------------------------- Old: ---- mcelog-1.0.8.tar.bz2 New: ---- mcelog-1.20.tar.bz2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ mcelog.spec ++++++ --- /var/tmp/diff_new_pack.sdK5G2/_old 2015-07-05 17:57:39.000000000 +0200 +++ /var/tmp/diff_new_pack.sdK5G2/_new 2015-07-05 17:57:39.000000000 +0200 @@ -1,7 +1,7 @@ # # spec file for package mcelog # -# Copyright (c) 2015 SUSE LINUX Products GmbH, Nuernberg, Germany. +# Copyright (c) 2015 SUSE LINUX GmbH, Nuernberg, Germany. 
# # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -20,7 +20,7 @@ Summary: Log Machine Check Events License: GPL-2.0 Group: System/Monitoring -Version: 1.0.8 +Version: 1.20 Release: 0 ExclusiveArch: %{ix86} x86_64 BuildRequires: libesmtp-devel @@ -45,6 +45,7 @@ Patch10: patches/add-f16h-support.patch Patch11: mcelog-socket-path.patch Patch12: fix_setgroups_missing_call.patch + BuildRoot: %{_tmppath}/%{name}-%{version}-build PreReq: %fillup_prereq Url: https://git.kernel.org/cgit/utils/cpu/mce/mcelog.git @@ -82,6 +83,7 @@ %patch10 -p1 %patch11 -p1 %patch12 -p1 + %build export SUSE_ASNEEDED=0 make CFLAGS="$RPM_OPT_FLAGS" @@ -126,6 +128,7 @@ %files %defattr (-,root,root,755) %{_mandir}/man8/* +%{_mandir}/man5/* /usr/sbin/mcelog %config /etc/logrotate.d/mcelog %dir /etc/mcelog @@ -137,5 +140,6 @@ %{_tmpfilesdir}/mcelog.conf %_docdir/%name %_sbindir/rcmcelog +%ghost /run/mcelog %changelog ++++++ Start-consolidating-AMD-specific-stuff.patch ++++++ --- /var/tmp/diff_new_pack.sdK5G2/_old 2015-07-05 17:57:39.000000000 +0200 +++ /var/tmp/diff_new_pack.sdK5G2/_new 2015-07-05 17:57:39.000000000 +0200 @@ -16,11 +16,11 @@ rename k8.c => amd.c (97%) rename k8.h => amd.h (79%) -Index: mcelog-1.0.1/Makefile +Index: mcelog-1.20/Makefile =================================================================== ---- mcelog-1.0.1.orig/Makefile -+++ mcelog-1.0.1/Makefile -@@ -29,7 +29,7 @@ all: mcelog +--- mcelog-1.20.orig/Makefile 2015-06-15 15:15:14.281052761 +0200 ++++ mcelog-1.20/Makefile 2015-06-15 15:15:52.523239254 +0200 +@@ -33,7 +33,7 @@ .PHONY: install clean depend @@ -29,10 +29,10 @@ nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \ eventloop.o leaky-bucket.o memdb.o server.o trigger.o \ client.o cache.o sysfs.o yellow.o page.o rbtree.o \ -Index: mcelog-1.0.1/k8.c +Index: mcelog-1.20/k8.c =================================================================== ---- 
mcelog-1.0.1.orig/k8.c -+++ /dev/null +--- mcelog-1.20.orig/k8.c 2015-06-15 14:25:23.000000000 +0200 ++++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,281 +0,0 @@ -/* Based on K8 decoding code written for the 2.4 kernel by Andi Kleen and - * Eric Morton. Hacked and extended for mcelog by AK. @@ -315,10 +315,10 @@ - } - return 1; -} -Index: mcelog-1.0.1/amd.c +Index: mcelog-1.20/amd.c =================================================================== ---- /dev/null -+++ mcelog-1.0.1/amd.c +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ mcelog-1.20/amd.c 2015-06-15 15:15:52.531239713 +0200 @@ -0,0 +1,282 @@ +/* Based on K8 decoding code written for the 2.4 kernel by Andi Kleen and + * Eric Morton. Hacked and extended for mcelog by AK. @@ -602,10 +602,10 @@ + } + return 1; +} -Index: mcelog-1.0.1/k8.h +Index: mcelog-1.20/k8.h =================================================================== ---- mcelog-1.0.1.orig/k8.h -+++ /dev/null +--- mcelog-1.20.orig/k8.h 2015-06-15 14:25:23.000000000 +0200 ++++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,11 +0,0 @@ -char *k8_bank_name(unsigned num); -void decode_k8_mc(struct mce *mce, int *ismemerr); @@ -618,10 +618,10 @@ -#define K8_MCELOG_THRESHOLD_LINK (4 * 9 + 1) -#define K8_MCELOG_THRESHOLD_L3_CACHE (4 * 9 + 2) -#define K8_MCELOG_THRESHOLD_FBDIMM (4 * 9 + 3) -Index: mcelog-1.0.1/amd.h +Index: mcelog-1.20/amd.h =================================================================== ---- /dev/null -+++ mcelog-1.0.1/amd.h +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ mcelog-1.20/amd.h 2015-06-15 15:15:52.539240159 +0200 @@ -0,0 +1,14 @@ +char *k8_bank_name(unsigned num); +void decode_amd_mc(enum cputype, struct mce *mce, int *ismemerr); @@ -637,10 +637,10 @@ + +#define CASE_AMD_CPUS \ + case CPU_K8 -Index: mcelog-1.0.1/mcelog.c +Index: mcelog-1.20/mcelog.c =================================================================== ---- mcelog-1.0.1.orig/mcelog.c -+++ mcelog-1.0.1/mcelog.c +--- 
mcelog-1.20.orig/mcelog.c 2015-06-15 15:15:14.293053554 +0200 ++++ mcelog-1.20/mcelog.c 2015-06-15 15:15:52.543240384 +0200 @@ -41,7 +41,7 @@ #include <fnmatch.h> #include "mcelog.h" @@ -650,7 +650,7 @@ #include "intel.h" #include "p4.h" #include "dmi.h" -@@ -397,9 +397,9 @@ static void dump_mce(struct mce *m, unsi +@@ -410,9 +410,9 @@ time_t t = m->time; Wprintf("TIME %llu %s", m->time, ctime(&t)); } ++++++ add-f10h-support.patch ++++++ --- /var/tmp/diff_new_pack.sdK5G2/_old 2015-07-05 17:57:39.000000000 +0200 +++ /var/tmp/diff_new_pack.sdK5G2/_new 2015-07-05 17:57:39.000000000 +0200 @@ -1,10 +1,10 @@ Add F10h decoding support Signed-off-by: Borislav Petkov <[email protected]> -Index: mcelog-1.0.1/amd.c +Index: mcelog-1.20/amd.c =================================================================== ---- mcelog-1.0.1.orig/amd.c -+++ mcelog-1.0.1/amd.c +--- mcelog-1.20.orig/amd.c 2015-06-15 15:16:07.712107628 +0200 ++++ mcelog-1.20/amd.c 2015-06-15 15:16:14.564499430 +0200 @@ -14,7 +14,7 @@ #include "mcelog.h" #include "amd.h" @@ -14,7 +14,7 @@ "data cache", "instruction cache", "bus unit", -@@ -22,28 +22,34 @@ static char *k8bank[] = { +@@ -22,28 +22,34 @@ "northbridge", "fixed-issue reoder" }; @@ -58,7 +58,7 @@ static char *nbextendederr[] = { "RAM ECC error", "CRC error", -@@ -65,6 +71,46 @@ static char *nbextendederr[] = { +@@ -65,6 +71,46 @@ "L3 Cache Tag Error", "L3 Cache LRU Error" }; @@ -105,7 +105,7 @@ static char *highbits[32] = { [31] = "valid", [30] = "error overflow (multiple errors)", -@@ -100,6 +146,21 @@ static char *k8threshold[] = { +@@ -100,6 +146,21 @@ "Unknown threshold counter", }; @@ -127,7 +127,7 @@ static void decode_k8_generic_errcode(u64 status) { -@@ -245,21 +306,393 @@ static decoder_t decoders[] = { +@@ -245,21 +306,393 @@ [5] = decode_k8_fr_mc, }; @@ -529,7 +529,7 @@ if (num < NELE(k8bank)) s = k8bank[num]; else if (num >= K8_MCE_THRESHOLD_BASE && -@@ -270,13 +703,16 @@ char *k8_bank_name(unsigned num) +@@ -270,13 +703,16 @@ return buf; } 
@@ -554,10 +554,10 @@ + } + return 1; } -Index: mcelog-1.0.1/amd.h +Index: mcelog-1.20/amd.h =================================================================== ---- mcelog-1.0.1.orig/amd.h -+++ mcelog-1.0.1/amd.h +--- mcelog-1.20.orig/amd.h 2015-06-15 15:16:07.728108543 +0200 ++++ mcelog-1.20/amd.h 2015-06-15 15:16:14.576500082 +0200 @@ -1,6 +1,25 @@ +#include <stdbool.h> + @@ -585,7 +585,7 @@ #define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */ #define K8_MCE_THRESHOLD_TOP (K8_MCE_THRESHOLD_BASE + 6 * 9) -@@ -10,6 +29,8 @@ int mce_filter_k8(struct mce *m); +@@ -10,6 +29,8 @@ #define K8_MCELOG_THRESHOLD_L3_CACHE (4 * 9 + 2) #define K8_MCELOG_THRESHOLD_FBDIMM (4 * 9 + 3) @@ -594,7 +594,7 @@ #define EC(x) ((x) & 0xffff) #define XEC(x, mask) (((x) >> 16) & mask) -@@ -22,23 +43,20 @@ int mce_filter_k8(struct mce *m); +@@ -22,23 +43,20 @@ #define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400) #define TT(x) (((x) >> 2) & 0x3) @@ -624,7 +624,7 @@ enum tt_ids { TT_INSTR = 0, -@@ -72,3 +90,7 @@ enum rrrr_ids { +@@ -72,3 +90,7 @@ R4_EVICT, R4_SNOOP, }; @@ -632,11 +632,11 @@ +#define CASE_AMD_CPUS \ + case CPU_K8: \ + case CPU_F10H -Index: mcelog-1.0.1/mcelog.h +Index: mcelog-1.20/mcelog.h =================================================================== ---- mcelog-1.0.1.orig/mcelog.h -+++ mcelog-1.0.1/mcelog.h -@@ -107,6 +107,7 @@ enum cputype { +--- mcelog-1.20.orig/mcelog.h 2015-06-15 15:16:07.740109229 +0200 ++++ mcelog-1.20/mcelog.h 2015-06-15 15:16:14.580500307 +0200 +@@ -111,6 +111,7 @@ CPU_P6OLD, CPU_CORE2, /* 65nm and 45nm */ CPU_K8, @@ -644,11 +644,11 @@ CPU_P4, CPU_NEHALEM, CPU_DUNNINGTON, -Index: mcelog-1.0.1/mcelog.c +Index: mcelog-1.20/mcelog.c =================================================================== ---- mcelog-1.0.1.orig/mcelog.c -+++ mcelog-1.0.1/mcelog.c -@@ -142,19 +142,20 @@ static void resolveaddr(unsigned long ad +--- mcelog-1.20.orig/mcelog.c 2015-06-15 15:16:07.752109915 +0200 ++++ mcelog-1.20/mcelog.c 2015-06-15 
15:19:18.771031150 +0200 +@@ -144,19 +144,20 @@ static int mce_filter(struct mce *m, unsigned recordlen) { @@ -673,7 +673,7 @@ } static void print_tsc(int cpunum, __u64 tsc, unsigned long time) -@@ -221,6 +222,7 @@ static char *cputype_name[] = { +@@ -223,6 +224,7 @@ [CPU_P6OLD] = "Intel PPro/P2/P3/old Xeon", [CPU_CORE2] = "Intel Core", /* 65nm and 45nm */ [CPU_K8] = "AMD K8 and derivates", @@ -681,7 +681,7 @@ [CPU_P4] = "Intel P4", [CPU_NEHALEM] = "Intel Xeon 5500 series / Core i3/5/7 (\"Nehalem/Westmere\")", [CPU_DUNNINGTON] = "Intel Xeon 7400 series", -@@ -239,6 +241,7 @@ static struct config_choice cpu_choices[ +@@ -245,6 +247,7 @@ { "p6old", CPU_P6OLD }, { "core2", CPU_CORE2 }, { "k8", CPU_K8 }, @@ -689,7 +689,7 @@ { "p4", CPU_P4 }, { "dunnington", CPU_DUNNINGTON }, { "xeon74xx", CPU_DUNNINGTON }, -@@ -330,15 +333,13 @@ static enum cputype setup_cpuid(u32 cpuv +@@ -343,15 +346,13 @@ parse_cpuid(cpuid, &family, &model); @@ -708,7 +708,7 @@ cpuvendor, family, model); return CPU_GENERIC; } -@@ -511,14 +512,9 @@ int is_cpu_supported(void) +@@ -526,14 +527,9 @@ } if (seen == ALL) { @@ -716,7 +716,7 @@ - if (family == 15) { - cputype = CPU_K8; - } else if (family >= 16) { -- SYSERRprintf("AMD Processor family %d: Please use the edac_mce_amd module instead.\n", family); +- SYSERRprintf("ERROR: AMD Processor family %d: mcelog does not support this processor. 
Please use the edac_mce_amd module instead.\n", family); - return 0; - } - } else if (!strcmp(vendor,"GenuineIntel")) ++++++ email.patch ++++++ --- /var/tmp/diff_new_pack.sdK5G2/_old 2015-07-05 17:57:39.000000000 +0200 +++ /var/tmp/diff_new_pack.sdK5G2/_new 2015-07-05 17:57:39.000000000 +0200 @@ -7,16 +7,16 @@ msg.c | 8 ++ 6 files changed, 343 insertions(+), 2 deletions(-) -Index: mcelog-1.0.8/Makefile +Index: mcelog-1.20/Makefile =================================================================== ---- mcelog-1.0.8.orig/Makefile 2014-12-20 19:35:05.000000000 +0100 -+++ mcelog-1.0.8/Makefile 2015-01-22 14:55:13.323708502 +0100 +--- mcelog-1.20.orig/Makefile 2015-06-15 14:25:23.000000000 +0200 ++++ mcelog-1.20/Makefile 2015-06-15 15:15:14.281052761 +0200 @@ -1,3 +1,4 @@ +CONFIG_EMAIL := 1 CFLAGS := -g -Os prefix := /usr etcprefix := -@@ -37,7 +38,8 @@ +@@ -38,7 +39,8 @@ client.o cache.o sysfs.o yellow.o page.o rbtree.o \ xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o bus.o unknown.o DISKDB_OBJ := diskdb.o dimm.o db.o @@ -26,7 +26,7 @@ DOC := mce.pdf ADD_DEFINES := -@@ -49,6 +51,12 @@ +@@ -50,6 +52,12 @@ all: dbquery endif @@ -39,10 +39,10 @@ SRC := $(OBJ:.o=.c) mcelog: ${OBJ} -Index: mcelog-1.0.8/email.c +Index: mcelog-1.20/email.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ mcelog-1.0.8/email.c 2015-01-22 14:55:13.327708502 +0100 ++++ mcelog-1.20/email.c 2015-06-15 15:15:14.285053019 +0200 @@ -0,0 +1,199 @@ +#include <unistd.h> +#include <signal.h> @@ -243,10 +243,10 @@ + smtp_destroy_session (session); + return 0; +} -Index: mcelog-1.0.8/email.h +Index: mcelog-1.20/email.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ mcelog-1.0.8/email.h 2015-01-22 14:55:13.331708502 +0100 ++++ mcelog-1.20/email.h 2015-06-15 15:15:14.289053315 +0200 @@ -0,0 +1,34 @@ +#ifndef _MCELOG_EMAIL_H_ +#define _MCELOG_EMAIL_H_ @@ 
-282,10 +282,10 @@ +#endif + +#endif -Index: mcelog-1.0.8/mcelog.c +Index: mcelog-1.20/mcelog.c =================================================================== ---- mcelog-1.0.8.orig/mcelog.c 2014-12-20 19:35:05.000000000 +0100 -+++ mcelog-1.0.8/mcelog.c 2015-01-22 14:55:13.335708502 +0100 +--- mcelog-1.20.orig/mcelog.c 2015-06-15 14:25:23.000000000 +0200 ++++ mcelog-1.20/mcelog.c 2015-06-15 15:15:14.293053554 +0200 @@ -37,6 +37,7 @@ #include <assert.h> #include <signal.h> @@ -313,7 +313,7 @@ static char *inputfile; char *processor_flags; static int foreground; -@@ -944,6 +948,7 @@ +@@ -949,6 +953,7 @@ "--pidfile file Write pid of daemon into file\n" "--no-imc-log Disable extended iMC logging\n" ); @@ -321,7 +321,7 @@ diskdb_usage(); print_cputypes(); exit(1); -@@ -1011,6 +1016,7 @@ +@@ -1016,6 +1021,7 @@ { "debug-numerrors", 0, NULL, O_DEBUG_NUMERRORS }, /* undocumented: for testing */ { "no-imc-log", 0, NULL, O_NO_IMC_LOG }, DISKDB_OPTIONS @@ -329,7 +329,7 @@ {} }; -@@ -1188,11 +1194,86 @@ +@@ -1193,11 +1199,86 @@ } } @@ -416,7 +416,7 @@ if (recordlen == 0) { Wprintf("no data in mce record\n"); -@@ -1219,12 +1300,16 @@ +@@ -1224,12 +1305,16 @@ finish = 1; if (!mce_filter(mce, recordlen)) continue; @@ -433,7 +433,7 @@ flushlog(); } -@@ -1335,6 +1420,8 @@ +@@ -1340,6 +1425,8 @@ exit(0); } else if (diskdb_cmd(opt, ac, av)) { exit(0); @@ -442,7 +442,7 @@ } else if (opt == 0) break; } -@@ -1363,6 +1450,10 @@ +@@ -1368,6 +1455,10 @@ logfn = av[optind++]; if (av[optind]) usage(); @@ -453,11 +453,11 @@ checkdmi(); general_setup(); -Index: mcelog-1.0.8/mcelog.h +Index: mcelog-1.20/mcelog.h =================================================================== ---- mcelog-1.0.8.orig/mcelog.h 2014-12-20 19:35:05.000000000 +0100 -+++ mcelog-1.0.8/mcelog.h 2015-01-22 14:55:13.339708502 +0100 -@@ -125,6 +125,7 @@ +--- mcelog-1.20.orig/mcelog.h 2015-06-15 14:25:23.000000000 +0200 ++++ mcelog-1.20/mcelog.h 2015-06-15 15:15:14.297053784 +0200 +@@ -131,6 +131,7 @@ enum 
option_ranges { O_COMMON = 500, O_DISKDB = 1000, @@ -465,10 +465,10 @@ }; enum syslog_opt { -Index: mcelog-1.0.8/msg.c +Index: mcelog-1.20/msg.c =================================================================== ---- mcelog-1.0.8.orig/msg.c 2014-12-20 19:35:05.000000000 +0100 -+++ mcelog-1.0.8/msg.c 2015-01-22 14:55:13.343708502 +0100 +--- mcelog-1.20.orig/msg.c 2015-06-15 14:25:23.000000000 +0200 ++++ mcelog-1.20/msg.c 2015-06-15 15:15:14.301053996 +0200 @@ -8,10 +8,13 @@ #include "mcelog.h" #include "msg.h" ++++++ mcelog-1.0.8.tar.bz2 -> mcelog-1.20.tar.bz2 ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/CHANGES new/mcelog-1.20/CHANGES --- old/mcelog-1.0.8/CHANGES 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/CHANGES 2015-06-15 14:25:23.000000000 +0200 @@ -1,5 +1,9 @@ <newer changes first> +Changes file is obsolete. +Please see git log on https://git.kernel.org/cgit/utils/cpu/mce/mcelog.git/ +for newer changes. 
+ Add Linux Kongress 2010 paper Add Sandy Bridge Support Write pid file by default in daemon mode diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/Makefile new/mcelog-1.20/Makefile --- old/mcelog-1.0.8/Makefile 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/Makefile 2015-06-15 14:25:23.000000000 +0200 @@ -1,6 +1,7 @@ CFLAGS := -g -Os prefix := /usr etcprefix := +MANDIR := ${prefix}/share/man # Define appropiately for your distribution # DOCDIR := /usr/share/doc/packages/mcelog @@ -54,21 +55,27 @@ mcelog: ${OBJ} # dbquery intentionally not installed by default -install: mcelog - mkdir -p $(DESTDIR)${etcprefix}/etc/mcelog $(DESTDIR)${prefix}/sbin $(DESTDIR)${prefix}/share/man/man8 +install: mcelog mcelog.conf mcelog.conf.5 mcelog.triggers.5 + mkdir -p $(DESTDIR)${etcprefix}/etc/mcelog $(DESTDIR)${prefix}/sbin $(DESTDIR)$(MANDIR)/man5 $(DESTDIR)$(MANDIR)/man8 install -m 755 -p mcelog $(DESTDIR)${prefix}/sbin/mcelog - install -m 644 -p mcelog.8 $(DESTDIR)${prefix}/share/man/man8 + install -m 644 -p mcelog.8 $(DESTDIR)$(MANDIR)/man8 + install -m 644 -p mcelog.conf.5 $(DESTDIR)$(MANDIR)/man5 + install -m 644 -p mcelog.triggers.5 $(DESTDIR)$(MANDIR)/man5 install -m 644 -p -b mcelog.conf $(DESTDIR)${etcprefix}/etc/mcelog/mcelog.conf for i in ${TRIGGERS} ; do \ install -m 755 -p -b triggers/$$i $(DESTDIR)${etcprefix}/etc/mcelog ; \ done ifdef DOCDIR + install -d 755 $(DESTDIR)${DOCDIR} install -m 644 -p ${DOC} $(DESTDIR)${DOCDIR} else echo echo "Consider defining DOCDIR to install additional documentation" endif +mcelog.conf.5: mcelog.conf config-intro.man + ./genconfig.py mcelog.conf config-intro.man > mcelog.conf.5 + clean: test-clean rm -f ${CLEAN} ${OBJ} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/README new/mcelog-1.20/README --- old/mcelog-1.0.8/README 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/README 2015-06-15 
14:25:23.000000000 +0200 @@ -2,11 +2,15 @@ reported by the hardware to the kernel. The kernel does the immediate actions (like killing processes etc.) and mcelog decodes the errors and manages various other advanced error responses like -offlining memory, CPUs or triggering events. +offlining memory, CPUs or triggering events. In addition +mcelog also handles corrected errors, by logging and accounting them. It primarily handles machine checks and thermal events, which are reported for errors detected by the CPU. +For more details on what mcelog can do and the underlying theory +see http://www.mcelog.org + It is recommended that mcelog runs on all x86 machines, both 64bit (since early 2.6) and 32bit (since 2.6.32) @@ -40,6 +44,11 @@ For distributors: +You can run mcelog from systemd or similar daemons. An example +systemd unit file is in mcelog.service. + +For older distributions using init scripts: + Please install a init script by default that runs mcelog in daemon mode. The mcelog.init script is a good starting point. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/config-intro.man new/mcelog-1.20/config-intro.man --- old/mcelog-1.0.8/config-intro.man 1970-01-01 01:00:00.000000000 +0100 +++ new/mcelog-1.20/config-intro.man 2015-06-15 14:25:23.000000000 +0200 @@ -0,0 +1,10 @@ +.SH NAME +mcelog.conf \- mcelog.conf reference +.SH SYNOPSIS +.B /etc/mcelog.conf +.SH DESCRIPTION + +/etc/mcelog.conf is the main configuration file for +.B mcelog(8). +This is configuration file separated into sections including +a default section. 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/dmi.c new/mcelog-1.20/dmi.c --- old/mcelog-1.0.8/dmi.c 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/dmi.c 2015-06-15 14:25:23.000000000 +0200 @@ -162,6 +162,8 @@ check_symbol: while ((fgets(linebuf, sizeof(linebuf) - 1, efi_systab)) != NULL) { char *addrp = strchr(linebuf, '='); + if (!addrp) + break; *(addrp++) = '\0'; if (strcmp(linebuf, "SMBIOS") == 0) { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/genconfig.py new/mcelog-1.20/genconfig.py --- old/mcelog-1.0.8/genconfig.py 1970-01-01 01:00:00.000000000 +0100 +++ new/mcelog-1.20/genconfig.py 2015-06-15 14:25:23.000000000 +0200 @@ -0,0 +1,80 @@ +#!/usr/bin/python +# generate man config documentation from mcelog.conf example +# genconfig.py mcelog.conf intro.html +import sys +import re +import string +import argparse + +ap = argparse.ArgumentParser(description="generate man config documentation from mcelog.conf example") +ap.add_argument('config', type=argparse.FileType('r'), help="mcelog example config file") +ap.add_argument('intro', type=argparse.FileType('r'), help="intro file") +args = ap.parse_args() + +def parse(f): + lineno = 1 + explanation = 0 + header = 1 + for line in f: + lineno += 1 + + # skip first comment + if header: + if not re.match('^#', line): + header = 0 + continue + + # explanation + m = re.match('^#\s(.*)', line) + if m: + explanation += 1 + s = m.group(1) + if explanation == 1: + s = string.capitalize(s) + print s + continue + + if explanation: + print ".PP" + explanation = 0 + + # empty line: new option + if re.match('\s+', line): + new_option() + continue + # group + m = re.match('\[(.*)\]', line) + if m: + start_group(m.group(1)) + continue + # config option + m = re.match('^(#?)([a-z-]+) = (.*)', line) + if m: + config_option(m.group(1), m.group(2), m.group(3)) + continue + print >>sys.stderr, "Unparseable line 
%d" % (lineno-1) + +def config_option(enabled, name, value): + print ".B %s = %s" % (name, value) + print ".PP" + +def start_group(name): + print ".SS \"The %s config section\"" % (name) + +def new_option(): + print ".PP" + + +print """ +.\" Auto generated mcelog.conf manpage. Do not edit. +.TH "mcelog.conf" 5 "mcelog" +""" + +print args.intro.read() +parse(args.config) +print """ +.SH SEE ALSO +.BR mcelog (8), +.BR mcelog.triggers (5) +.B http://www.mcelog.org +""" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/haswell.c new/mcelog-1.20/haswell.c --- old/mcelog-1.0.8/haswell.c 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/haswell.c 2015-06-15 14:25:23.000000000 +0200 @@ -1,5 +1,5 @@ /* Copyright (C) 2013 Intel Corporation - Decode Intel Ivy Bridge specific machine check errors. + Decode Intel Haswell specific machine check errors. mcelog is free software; you can redistribute it and/or modify it under the terms of the GNU General Public diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/intel.c new/mcelog-1.20/intel.c --- old/mcelog-1.0.8/intel.c 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/intel.c 2015-06-15 14:25:23.000000000 +0200 @@ -34,7 +34,8 @@ if (cpu == CPU_NEHALEM || cpu == CPU_XEON75XX || cpu == CPU_INTEL || cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP || cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX || - cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX || cpu == CPU_BROADWELL) + cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX || cpu == CPU_BROADWELL || + cpu == CPU_KNIGHTS_LANDING) memory_error_support = 1; } @@ -72,8 +73,15 @@ return CPU_HASWELL; else if (model == 0x3f) return CPU_HASWELL_EPEX; - else if (model == 0x3d) + else if (model == 0x3d || model == 0x56) return CPU_BROADWELL; + else if (model == 0x57) + return CPU_KNIGHTS_LANDING; + else if (model == 0x1c || model == 0x26 || model == 0x27 || + model 
== 0x35 || model == 0x36 || model == 0x36 || + model == 0x37 || model == 0x4a || model == 0x4c || + model == 0x4d || model == 0x5a || model == 0x5d) + return CPU_ATOM; if (model > 0x1a) { Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n", model); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/intel.h new/mcelog-1.20/intel.h --- old/mcelog-1.0.8/intel.h 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/intel.h 2015-06-15 14:25:23.000000000 +0200 @@ -20,5 +20,6 @@ case CPU_IVY_BRIDGE_EPEX: \ case CPU_HASWELL: \ case CPU_HASWELL_EPEX: \ - case CPU_BROADWELL + case CPU_BROADWELL: \ + case CPU_KNIGHTS_LANDING diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/leaky-bucket.c new/mcelog-1.20/leaky-bucket.c --- old/mcelog-1.0.8/leaky-bucket.c 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/leaky-bucket.c 2015-06-15 14:25:23.000000000 +0200 @@ -25,7 +25,7 @@ return time(NULL); } -static void bucket_age(const struct bucket_conf *c, struct leaky_bucket *b, +void bucket_age(const struct bucket_conf *c, struct leaky_bucket *b, time_t now) { long diff; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/leaky-bucket.h new/mcelog-1.20/leaky-bucket.h --- old/mcelog-1.0.8/leaky-bucket.h 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/leaky-bucket.h 2015-06-15 14:25:23.000000000 +0200 @@ -27,5 +27,7 @@ int bucket_conf_init(struct bucket_conf *c, const char *rate); void bucket_init(struct leaky_bucket *b); time_t bucket_time(void); +void bucket_age(const struct bucket_conf *c, struct leaky_bucket *b, + time_t now); #endif diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/mcelog.8 new/mcelog-1.20/mcelog.8 --- old/mcelog-1.0.8/mcelog.8 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/mcelog.8 2015-06-15 
14:25:23.000000000 +0200 @@ -1,5 +1,4 @@ -.\" disk db commented out for now because it's not usable enough -.TH MCELOG 8 "May 2009" "" "Linux's Administrator's Manual" +.TH MCELOG 8 "Mar 2015" "" "Linux's Administrator's Manual" .SH NAME mcelog \- Decode kernel machine check log on x86 machines .SH SYNOPSIS @@ -26,13 +25,16 @@ transfer errors on the front side bus or CPU interconnect or other internal errors. Possible causes can be cosmic radiation, instable power supplies, -cooling problems, broken hardware, or bad luck. +cooling problems, broken hardware, running systems out of specification, +or bad luck. Most errors can be corrected by the CPU by internal error correction mechanisms. Uncorrected errors cause machine check exceptions which -may panic the machine. +may kill processes or panic the machine. A small number of corrected +errors is usually not a cause for worry, but a large number can indicate +future failure. -When a corrected error happens the x86 kernel writes a record describing +When a corrected or recovered error happens the x86 kernel writes a record describing the MCE into a internal ring buffer available through the .I /dev/mcelog device @@ -43,7 +45,11 @@ on the standard output or optionally into the system log. Optionally it can also take more options like keeping statistics or -triggering shell scripts on specific events. +triggering shell scripts on specific events. By default mcelog +supports offlining memory pages with persistent corrected errors, +offlining CPU cores if they developed cache problems, +and otherwise logging specific events to the system log after +they crossed a threshold. The normal operating modi for mcelog are running as a regular cron job (traditional way, deprecated), @@ -112,12 +118,12 @@ With the .B \-\-dmi -option mcelog will look up the addresses reported in machine +option mcelog will look up the DIMMs reported in machine checks in the .I SMBIOS/DMI -tables of the BIOS. 
-This can sometimes tell you which DIMM or memory controller -has developed a problem. More often the information reported +tables of the BIOS and map the DIMMs to board identifiers. +This only works when the BIOS reports the identifiers correctly. +Unfortunately often the information reported by the BIOS is either subtly or obviously wrong or useless. This option requires that mcelog has read access to /dev/mem (normally requires root) and runs on the same machine @@ -281,6 +287,9 @@ use .I logfile = /tmp/logfile +For more information on the config file please see +.B mcelog.conf(5). + .SH NOTES The kernel prefers old messages over new. If the log buffer overflows only old ones will be kept. @@ -308,9 +317,14 @@ .\"/var/lib/memory-errors .SH SEE ALSO +.BR mcelog.conf(5), +.BR mcelog.triggers(5) + +http://www.mcelog.org + AMD x86-64 architecture programmer's manual, Volume 2, System programming Intel 64 and IA32 Architectures Software Developer's manual, Volume 3, System programming guide -Parts 1 and 2. Machine checks are described in Chapter 14 in Part1 and in Appendix E in Part2. +Chapter 15 and 16. http://www.intel.com/sdm Datasheet of your CPU. 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/mcelog.c new/mcelog-1.20/mcelog.c --- old/mcelog-1.0.8/mcelog.c 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/mcelog.c 2015-06-15 14:25:23.000000000 +0200 @@ -232,6 +232,8 @@ [CPU_HASWELL] = "Haswell", /* Fill in better name */ [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", [CPU_BROADWELL] = "Broadwell", + [CPU_KNIGHTS_LANDING] = "Knights Landing", + [CPU_ATOM] = "ATOM", }; static struct config_choice cpu_choices[] = { @@ -271,8 +273,10 @@ { "haswell-ep", CPU_HASWELL_EPEX }, { "haswell-ex", CPU_HASWELL_EPEX }, { "broadwell", CPU_BROADWELL }, + { "knightslanding", CPU_KNIGHTS_LANDING }, { "xeon-v2", CPU_IVY_BRIDGE_EPEX }, { "xeon-v3", CPU_HASWELL_EPEX }, + { "atom", CPU_ATOM }, { NULL } }; @@ -434,7 +438,8 @@ mod); } if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX && - cputype != CPU_HASWELL_EPEX && cputype != CPU_BROADWELL) + cputype != CPU_HASWELL_EPEX && cputype != CPU_BROADWELL && + cputype != CPU_KNIGHTS_LANDING) resolveaddr(m->addr); if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) { diskdb_resolve_addr(m->addr); @@ -521,7 +526,7 @@ if (family == 15) { cputype = CPU_K8; } else if (family >= 16) { - SYSERRprintf("AMD Processor family %d: Please use the edac_mce_amd module instead.\n", family); + SYSERRprintf("ERROR: AMD Processor family %d: mcelog does not support this processor. 
Please use the edac_mce_amd module instead.\n", family); return 0; } } else if (!strcmp(vendor,"GenuineIntel")) @@ -745,7 +750,7 @@ else s += 3; - n = sscanf(s, "%02x:<%016Lx> {%100s}%n", + n = sscanf(s, "%02x:<%016Lx> {%99s}%n", &cs, &m.ip, symbol, &next); @@ -1381,7 +1386,7 @@ d.buf = xalloc(d.recordlen * d.loglen); if (daemon_mode) { - prefill_memdb(); + prefill_memdb(do_dmi); if (!do_dmi) closedmi(); server_setup(); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/mcelog.conf new/mcelog-1.20/mcelog.conf --- old/mcelog-1.0.8/mcelog.conf 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/mcelog.conf 2015-06-15 14:25:23.000000000 +0200 @@ -9,36 +9,36 @@ # white space is not allowed in value currently, except at the end where it is dropped # -# in general all command line options that are not commands work here -# see man mcelog or mcelog --help for a list +# In general all command line options that are not commands work here. +# See man mcelog or mcelog --help for a list. # e.g. to enable the --no-syslog option use #no-syslog = yes (or no to disable) # when the option has a argument #logfile = /tmp/logfile -# below are the options which are not command line options +# below are the options which are not command line options. # Set CPU type for which mcelog decodes events: #cpu = type -# for valid values for type please see mcelog --help +# For valid values for type please see mcelog --help. # If this value is set incorrectly the decoded output will be likely incorrect. -# by default when this parameter is not set mcelog uses the CPU it is running on +# By default when this parameter is not set mcelog uses the CPU it is running on # on very new kernels the mcelog events reported by the kernel also carry # the CPU type which is used too when available and not overriden. # Enable daemon mode: #daemon = yes # By default mcelog just processes the currently pending events and exits. 
-# in daemon mode it will keep running as a daemon in the background and poll +# In daemon mode it will keep running as a daemon in the background and poll # the kernel for events and then decode them. -# Filter out known broken events by default +# Filter out known broken events by default. filter = yes -# don't log memory errors individually -# they still get accounted if that is enabled +# Don't log memory errors individually. +# They still get accounted if that is enabled. #filter-memory-errors = yes # output in undecoded raw format to be easier machine readable -# (default is decoded) +# (default is decoded). #raw = yes # Set CPU Mhz to decode uptime from time stamp counter (output @@ -62,16 +62,17 @@ # Append log output to logfile instead of stdout. Only when no syslog logging is active #logfile = filename -# Use SMBIOS information to decode DIMMs (needs root) -# This function is not recommended to use right now and generally not needed +# Use SMBIOS information to decode DIMMs (needs root). +# This function is not recommended to use right now and generally not needed. # The exception is memdb prepopulation, which is configured separately below. #dmi = no -# when in daemon mode run as this user after set up -# note that the triggers will run as this user too -# setting this to non root will mean that triggers cannot take some corrective -# action, like offlining objects +# When in daemon mode run as this user after set up. +# Note that the triggers will run as this user too. +# Setting this to non root will mean that triggers cannot take some corrective +# action, like offlining objects. #run-credentials-user = root + # group to run as daemon with # default to the group of the run-credentials-user #run-credentials-group = nobody @@ -79,72 +80,88 @@ [server] # user allowed to access client socket. # when set to * match any -# root is always allowed to access +# root is always allowed to access. 
# default: root only client-user = root # group allowed to access mcelog -# when no group is configured any group matches (but still user checking) +# When no group is configured any group matches (but still user checking). # when set to * match any #client-group = root -# path to the unix socket for client<->server communication -# when no socket-path is configured the server will not start +# Path to the unix socket for client<->server communication. +# When no socket-path is configured the server will not start #socket-path = /var/run/mcelog-client -# when mcelog starts it checks if a server is already running. timeout +# When mcelog starts it checks if a server is already running. This configures the timeout # for this check. #initial-ping-timeout = 2 # [dimm] # Is the in memory DIMM error tracking enabled? # Only works on systems with integrated memory controller and -# which are supported -# Only takes effect in daemon mode +# which are supported. +# Only takes effect in daemon mode. dimm-tracking-enabled = yes -# Use DMI information from the BIOS to prepopulate DIMM database +# Use DMI information from the BIOS to prepopulate DIMM database. # Note this might not work with all BIOS and requires mcelog to run as root. # Alternative is to let mcelog create DIMM objects on demand. dmi-prepopulate = yes # -# execute these triggers when the rate of corrected or uncorrected -# errors per DIMM exceeds the threshold +# Execute these triggers when the rate of corrected or uncorrected +# Errors per DIMM exceeds the threshold. # Note when the hardware does not report DIMMs this might also -# be per channel +# be per channel. # The default of 10/24h is reasonable for server quality -# DDR3 DIMMs as of 2009/10 +# DDR3 DIMMs as of 2009/10. #uc-error-trigger = dimm-error-trigger uc-error-threshold = 1 / 24h #ce-error-trigger = dimm-error-trigger ce-error-threshold = 10 / 24h [socket] -# Memory error accounting per socket +# Enable memory error accounting per socket. 
socket-tracking-enabled = yes -# Threshold and trigger for uncorrected memory errors on a socket + +# Threshold and trigger for uncorrected memory errors on a socket. # mem-uc-error-trigger = socket-memory-error-trigger + mem-uc-error-threshold = 100 / 24h -# Threshold and trigger for corrected memory errors on a socket + +# Trigger script for corrected memory errors on a socket. mem-ce-error-trigger = socket-memory-error-trigger + +# Threshold on when to trigger a correct error for the socket. + mem-ce-error-threshold = 100 / 24h + # Log socket error threshold explicitely? mem-ce-error-log = yes +# Trigger script for uncorrected bus error events bus-uc-threshold-trigger = bus-error-trigger + +# Trigger script for uncorrected IOMCA erors iomca-threshold-trigger = iomca-error-trigger + +# Trigger script for other uncategorized errors unknown-threshold-trigger = unknown-error-trigger [cache] -# Processing of cache error thresholds reported by Intel CPUs +# Processing of cache error thresholds reported by Intel CPUs. cache-threshold-trigger = cache-error-trigger + # Should cache threshold events be logged explicitely? cache-threshold-log = yes [page] -# Memory error accouting per 4K memory page -# Threshold for the correct memory errors trigger script +# Memory error accouting per 4K memory page. +# Threshold for the correct memory errors trigger script. memory-ce-threshold = 10 / 24h -# Trigger script for corrected errors + +# Trigger script for corrected errors. # memory-ce-trigger = page-error-trigger + # Should page threshold events be logged explicitely? 
memory-ce-log = yes + # specify the internal action in mcelog to exceeding a page error threshold # this is done in addition to executing the trigger script if available # off no action diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/mcelog.conf.5 new/mcelog-1.20/mcelog.conf.5 --- old/mcelog-1.0.8/mcelog.conf.5 1970-01-01 01:00:00.000000000 +0100 +++ new/mcelog-1.20/mcelog.conf.5 2015-06-15 14:25:23.000000000 +0200 @@ -0,0 +1,283 @@ + +." Auto generated mcelog.conf manpage. Do not edit. +.TH "mcelog.conf" 5 "mcelog" + +.SH NAME +mcelog.conf \- mcelog.conf reference +.SH SYNOPSIS +.B /etc/mcelog.conf +.SH DESCRIPTION + +/etc/mcelog.conf is the main configuration file for +.B mcelog(8). +This is configuration file separated into sections including +a default section. + + +General format +.PP +.B optionname = value +.PP +White space is not allowed in value currently, except at the end where it is dropped + +.PP +.PP +In general all command line options that are not commands work here. +See man mcelog or mcelog --help for a list. +e.g. to enable the --no-syslog option use +.PP +.B no-syslog = yes (or no to disable) +.PP +When the option has a argument +.PP +.B logfile = /tmp/logfile +.PP +Below are the options which are not command line options. +.PP +.PP +Set cpu type for which mcelog decodes events: +.PP +.B cpu = type +.PP +For valid values for type please see mcelog --help. +If this value is set incorrectly the decoded output will be likely incorrect. +By default when this parameter is not set mcelog uses the CPU it is running on +on very new kernels the mcelog events reported by the kernel also carry +the CPU type which is used too when available and not overriden. +.PP +.PP +Enable daemon mode: +.PP +.B daemon = yes +.PP +By default mcelog just processes the currently pending events and exits. 
+In daemon mode it will keep running as a daemon in the background and poll +the kernel for events and then decode them. +.PP +.PP +Filter out known broken events by default. +.PP +.B filter = yes +.PP +Don't log memory errors individually. +They still get accounted if that is enabled. +.PP +.B filter-memory-errors = yes +.PP +.PP +Output in undecoded raw format to be easier machine readable +(default is decoded). +.PP +.B raw = yes +.PP +.PP +Set cpu mhz to decode uptime from time stamp counter (output +unreliable, not needed on new kernels which report the event time +directly. A lot of systems don't have a linear time stamp clock +and the output is wrong then. +Normally mcelog tries to figure out if it the TSC is reliable +and only uses the current frequency then. +Setting a frequency forces timestamp decoding. +This setting is obsolete with modern kernels which report the time +directly. +.PP +.B cpumhz = 1800.00 +.PP +.PP +Log output options +Log decoded machine checks in syslog (default stdout or syslog for daemon) +.PP +.B syslog = yes +.PP +Log decoded machine checks in syslog with error level +.PP +.B syslog-error = yes +.PP +Never log anything to syslog +.PP +.B no-syslog = yes +.PP +Append log output to logfile instead of stdout. only when no syslog logging is active +.PP +.B logfile = filename +.PP +.PP +Use smbios information to decode dimms (needs root). +This function is not recommended to use right now and generally not needed. +The exception is memdb prepopulation, which is configured separately below. +.PP +.B dmi = no +.PP +.PP +When in daemon mode run as this user after set up. +Note that the triggers will run as this user too. +Setting this to non root will mean that triggers cannot take some corrective +action, like offlining objects. 
+.PP +.B run-credentials-user = root +.PP +.PP +Group to run as daemon with +default to the group of the run-credentials-user +.PP +.B run-credentials-group = nobody +.PP +.PP +.SS "The server config section" +User allowed to access client socket. +when set to * match any +root is always allowed to access. +default: root only +.PP +.B client-user = root +.PP +Group allowed to access mcelog +When no group is configured any group matches (but still user checking). +when set to * match any +.PP +.B client-group = root +.PP +Path to the unix socket for client<->server communication. +When no socket-path is configured the server will not start +.PP +.B socket-path = /var/run/mcelog-client +.PP +When mcelog starts it checks if a server is already running. this configures the timeout +for this check. +.PP +.B initial-ping-timeout = 2 +.PP + +.PP +.SS "The dimm config section" +Is the in memory dimm error tracking enabled? +Only works on systems with integrated memory controller and +which are supported. +Only takes effect in daemon mode. +.PP +.B dimm-tracking-enabled = yes +.PP +Use dmi information from the bios to prepopulate dimm database. +Note this might not work with all BIOS and requires mcelog to run as root. +Alternative is to let mcelog create DIMM objects on demand. +.PP +.B dmi-prepopulate = yes +.PP + +Execute these triggers when the rate of corrected or uncorrected +Errors per DIMM exceeds the threshold. +Note when the hardware does not report DIMMs this might also +be per channel. +The default of 10/24h is reasonable for server quality +DDR3 DIMMs as of 2009/10. +.PP +.B uc-error-trigger = dimm-error-trigger +.PP +.B uc-error-threshold = 1 / 24h +.PP +.B ce-error-trigger = dimm-error-trigger +.PP +.B ce-error-threshold = 10 / 24h +.PP +.PP +.SS "The socket config section" +Enable memory error accounting per socket. +.PP +.B socket-tracking-enabled = yes +.PP +.PP +Threshold and trigger for uncorrected memory errors on a socket. 
+mem-uc-error-trigger = socket-memory-error-trigger +.PP +.PP +.B mem-uc-error-threshold = 100 / 24h +.PP +.PP +Trigger script for corrected memory errors on a socket. +.PP +.B mem-ce-error-trigger = socket-memory-error-trigger +.PP +.PP +Threshold on when to trigger a correct error for the socket. +.PP +.PP +.B mem-ce-error-threshold = 100 / 24h +.PP +.PP + log socket error threshold explicitely? +.PP +.B mem-ce-error-log = yes +.PP +.PP +Trigger script for uncorrected bus error events +.PP +.B bus-uc-threshold-trigger = bus-error-trigger +.PP +.PP +Trigger script for uncorrected iomca erors +.PP +.B iomca-threshold-trigger = iomca-error-trigger +.PP +.PP +Trigger script for other uncategorized errors +.PP +.B unknown-threshold-trigger = unknown-error-trigger +.PP +.PP +.SS "The cache config section" +Processing of cache error thresholds reported by intel cpus. +.PP +.B cache-threshold-trigger = cache-error-trigger +.PP +.PP +Should cache threshold events be logged explicitely? +.PP +.B cache-threshold-log = yes +.PP +.PP +.SS "The page config section" +Memory error accouting per 4k memory page. +Threshold for the correct memory errors trigger script. +.PP +.B memory-ce-threshold = 10 / 24h +.PP +.PP +Trigger script for corrected errors. +memory-ce-trigger = page-error-trigger +.PP +.PP +Should page threshold events be logged explicitely? +.PP +.B memory-ce-log = yes +.PP +.PP +Specify the internal action in mcelog to exceeding a page error threshold +this is done in addition to executing the trigger script if available +off no action +account only account errors +soft try to soft-offline page without killing any processes + This requires an uptodate kernel. Might not be successfull. +hard try to hard-offline page by killing processes + Requires an uptodate kernel. Might not be successfull. 
+soft-then-hard First try to soft offline, then try hard offlining +.PP +.B memory-ce-action = off|account|soft|hard|soft-then-hard +.PP +.B memory-ce-action = soft +.PP +.PP +.SS "The trigger config section" +Maximum number of running triggers +.PP +.B children-max = 2 +.PP +Execute triggers in this directory +.PP +.B directory = /etc/mcelog +.PP + +.SH SEE ALSO +.BR mcelog (8) +, +.B http://www.mcelog.org + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/mcelog.h new/mcelog-1.20/mcelog.h --- old/mcelog-1.0.8/mcelog.h 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/mcelog.h 2015-06-15 14:25:23.000000000 +0200 @@ -65,14 +65,18 @@ #define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ #define MCI_STATUS_S (1ULL<<56) /* signalled */ #define MCI_STATUS_AR (1ULL<<55) /* action-required */ +#define MCI_STATUS_FWST (1ULL<<37) /* Firmware updated status indicator */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */ #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ +#define MCG_STATUS_LMCES (1ULL<<3) /* local machine check signaled */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_TES_P (1ULL<<11) /* Yellow bit cache threshold supported */ #define MCG_SER_P (1ULL<<24) /* MCA recovery / new status */ +#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ +#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */ #define NELE(x) (sizeof(x)/sizeof(*(x))) #define err(x) perror(x),exit(1) @@ -120,6 +124,8 @@ CPU_HASWELL, CPU_HASWELL_EPEX, CPU_BROADWELL, + CPU_KNIGHTS_LANDING, + CPU_ATOM, }; enum option_ranges { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/mcelog.service new/mcelog-1.20/mcelog.service --- old/mcelog-1.0.8/mcelog.service 1970-01-01 01:00:00.000000000 +0100 +++ new/mcelog-1.20/mcelog.service 
2015-06-15 14:25:23.000000000 +0200 @@ -0,0 +1,10 @@ +[Unit] +Description=Machine Check Exception Logging Daemon +After=syslog.target + +[Service] +ExecStart=/usr/sbin/mcelog --ignorenodev --daemon --foreground +StandardOutput=syslog + +[Install] +WantedBy=multi-user.target diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/mcelog.triggers.5 new/mcelog-1.20/mcelog.triggers.5 --- old/mcelog-1.0.8/mcelog.triggers.5 1970-01-01 01:00:00.000000000 +0100 +++ new/mcelog-1.20/mcelog.triggers.5 2015-06-15 14:25:23.000000000 +0200 @@ -0,0 +1,231 @@ +'\" t +.TH "mcelog.triggers" 5 "mcelog" +.SH NAME +mcelog.triggers \- mcelog trigger scripts reference +.SH SYNOPSIS +.B /etc/mcelog/bus-error-trigger +.br +.B /etc/mcelog/cache-error-trigger +.br +.B /etc/mcelog/dimm-error-trigger +.br +.B /etc/mcelog/iomca-error-trigger +.br +.B /etc/mcelog/page-error-trigger +.br +.B /etc/mcelog/socket-memory-error-trigger +.br +.B /etc/mcelog/unknown-error-trigger +.br +.SH DESCRIPTION +.BR mcelog(8) +maintains thresholds of errors using a +.I leaky-bucket +algorithm. +When the number of errors in a specific +time window exceeds a pre-configured threshold a +.I trigger +will be executed. Triggers are usually shell scripts in the +.B /etc/mcelog +directory +but can be also other internal actions. Thresholds and triggers +can be configured in +.BR mcelog.conf(5) + +Trigger will run as the user configured for mcelog +in +.I mcelog.conf, +by default root. The default trigger action can +be overridden by specifying a different trigger script in the configuration file. +Actions in addition to the default trigger +(like notifying an administrator) can be put into the respective +.I /etc/mcelog/*.local +script which is executed after the default action. This allows updating the default +scripts without overriding local actions. All trigger actions are also +logged to syslog. 
+.PP +.B "The DIMM and socket memory error triggers" +.PP +The +.B /etc/mcelog/dimm-error-trigger +and +.B /etc/mcelog/socket-memory-error-trigger +scripts are executed when a DIMM or a CPU socket exceeds +a configured corrected or uncorrected memory error threshold. +The thresholds are configured in the +.B mcelog.conf +.I [dimm] +and +.I [socket] +sections. +The default triggers log a warning message in the system log. +The triggers are only executed when mcelog runs as a daemon. + +Arguments are passed as environment variables +.TS +tab(:); +l l. +THRESHOLD:human readable threshold status +MESSAGE:Human readable consolidated error message +TOTALCOUNT:total corrected or uncorrected count of errors for current DIMM depending on what triggered the event +LOCATION:Consolidated location as a single string +DMI_LOCATION:DIMM location from DMI/SMBIOS if available +DMI_NAME:DIMM identifier from DMI/SMBIOS if available +DIMM:DIMM number reported by hardware +CHANNEL:Channel number reported by hardware +SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM +CECOUNT:Total corrected error count for DIMM +UCCOUNT:Total uncorrected error count for DIMM +LASTEVENT:Time stamp of event that triggered threshold (in time_t format, seconds) +THRESHOLD_COUNT:Total number of events in current threshold time period of specific type +.TE + +After the default action local actions in +.B /etc/mcelog/dimm-error-trigger.local +or respective +.B /etc/mcelog/socket-memory-error-trigger.local +are executed. + +.PP +.B "The page error trigger" +.PP +The +.B /etc/mcelog/page-error-trigger +script is +executed by mcelog in daemon mode when a page +in memory exceeds a pre-configured corrected or uncorrected error threshold. +mcelog internally also implements offlining the page through the kernel.
+This is configured through the +.I [page] +section of +.BR mcelog.conf(5) +.PP +The environment arguments are the same as for the +.I dimm-error-trigger +script +.PP +After the default action local actions in +.I /etc/mcelog/page-error-trigger.local are executed. + +.PP +.B "The cache error trigger" +.PP +The +.I /etc/mcelog/cache-error-trigger +shell script is called for cache error handling in daemon mode +when a CPU reports excessive corrected cache errors. +This could be an indication for future uncorrected errors. +.PP +This trigger is configured through the +.B [cache] +section in the +.BR mcelog.conf(5) +configuration file. The threshold is defined by the CPU. The default trigger offlines the affected CPU cores, unless it is the last core running. +.PP +Arguments are passed as environment variables +.TS +tab(:); +l l. +MESSAGE:Human readable error message +CPU:Linux CPU number that triggered the error +LEVEL:Cache level affected by error +TYPE:Cache type affected by error (Data,Instruction,Generic) +AFFECTED_CPUS:List of CPUs sharing the affected cache +SOCKETID:Socket ID of affected CPU +.TE +.PP +After the default action local actions in +.I /etc/mcelog/cache-error-trigger.local are executed. +.PP +.B "The bus-uc-threshold-trigger" +.PP +The +.B bus-uc-threshold-trigger +runs on uncorrected errors on an IO bus. It is configured through the +.B bus-uc-threshold-trigger +and +.B bus-uc-threshold-trigger-threshold +options in +.I /etc/mcelog.conf(5). +By default it logs a message with the error location to the system log. +After the default action local actions in +.I /etc/mcelog/bus-uc-error-trigger.local +are executed. +.PP +Arguments are passed as environment variables +.TS +tab(:); +l l. +MESSAGE:Human readable consolidated error message.
+LOCATION:Consolidated location as a single string +SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM +LEVEL:Interconnect level +PARTICIPATION:Processor Participation (Originator, Responder or Observer) +REQUEST:Request type (read, write, prefetch, etc.) +ORIGIN :Memory or IO +TIMEOUT:The request timed out or not +.TE +.PP +.B "The iomca-error-trigger" +.PP +The +.B iomca-error-trigger +runs when a socket receives bus or interconnect errors. +It is configured through the +.B iomca-error-trigger +and +.B iomca-error-trigger-threshold +options in +.I /etc/mcelog.conf. By default it logs a message with the error location to the system log. +After the default action local actions in +.I /etc/mcelog/iomca-error-trigger.local are executed. +.PP +Arguments are passed as environment variables +.TS +tab(:); +l l. +MESSAGE:Human readable consolidated error message +LOCATION:Consolidated location as a single string +SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM +CPU:Linux CPU number that triggered the error +SET:PCI segment number +BUS:PCI bus number +DEVICE:PCI device number +FUNCTION:PCI function number +.TE +.PP +.B "The unknown-error-trigger" +.PP +The +.B unknown-error-trigger +runs on any errors not otherwise categorized. +It is configured through the +.B unknown-error-trigger +and +.B unknown-error-trigger-threshold +options in +.I /etc/mcelog.conf. +By default it logs a message to the system log. +After the default action local actions in +.I /etc/mcelog/unknown-error-trigger.local +are executed. +.PP +Arguments are passed as environment variables +.TS +tab(:); +l l. 
+MESSAGE:Human readable consolidated error message +LOCATION:Consolidated location as a single string +SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM +CPU:Linux CPU number that triggered the error +STATUS:IA32_MCi_STATUS register value +ADDR:IA32_MCi_ADDR register value +MISC:IA32_MCi_MISC register value +MCGSTATUS:IA32_MCG_STATUS register value +MCGCAP:IA32_MCG_CAP register value +.TE +.SH SEE ALSO +http://www.mcelog.org + +.B mcelog(8), +.B mcelog.conf(5) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/memdb.c new/mcelog-1.20/memdb.c --- old/mcelog-1.0.8/memdb.c 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/memdb.c 2015-06-15 14:25:23.000000000 +0200 @@ -270,6 +270,7 @@ int all = (flags & DUMP_ALL); char *s; + bucket_age(bc, &e->bucket, bucket_time()); if (e->count || e->bucket.count || all) fprintf(f, "%s:\n", name); if (e->count || all) { @@ -382,7 +383,7 @@ } /* Prepopulate DIMM database from BIOS information */ -void prefill_memdb(void) +void prefill_memdb(int do_dmi) { static int initialized; int i; @@ -395,7 +396,7 @@ if (!memdb_enabled) return; initialized = 1; - if (config_bool("dimm", "dmi-prepopulate") == 0) + if (config_bool("dimm", "dmi-prepopulate") == 0 || !do_dmi) return; if (opendmi() < 0) return; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/memdb.h new/mcelog-1.20/memdb.h --- old/mcelog-1.0.8/memdb.h 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/memdb.h 2015-06-15 14:25:23.000000000 +0200 @@ -11,7 +11,7 @@ DUMP_BIOS = (1 << 1), }; -void prefill_memdb(void); +void prefill_memdb(int do_dmi); void memdb_config(void); void dump_memory_errors(FILE *f, enum printflags flags); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/p4.c new/mcelog-1.20/p4.c --- old/mcelog-1.0.8/p4.c 2014-12-20 19:35:05.000000000 +0100 +++ 
new/mcelog-1.20/p4.c 2015-06-15 14:25:23.000000000 +0200 @@ -317,6 +317,10 @@ if (status & (MCI_STATUS_S|MCI_STATUS_AR)) Wprintf("%s\n", arstate[(status >> 55) & 3]); + if ((mcgcap & MCG_SER_P) && (status & MCI_STATUS_FWST)) { + Wprintf("Firmware may have updated this error\n"); + } + if ((mcgcap == 0 || (mcgcap & MCG_TES_P)) && !(status & MCI_STATUS_UC)) { track = (status >> 53) & 3; decode_tracking(track); @@ -334,6 +338,8 @@ Wprintf("EIPV "); if (mcgstatus & MCG_STATUS_MCIP) Wprintf("MCIP "); + if (mcgstatus & MCG_STATUS_LMCES) + Wprintf("LMCE "); Wprintf("\n"); } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/server.c new/mcelog-1.20/server.c --- old/mcelog-1.0.8/server.c 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/server.c 2015-06-15 14:25:23.000000000 +0200 @@ -291,7 +291,7 @@ { struct sigaction oldsa; struct sigaction sa = { .sa_handler = ping_timeout }; - int ret = -1, n; + int ret, n; char buf[10]; int fd = socket(PF_UNIX, SOCK_STREAM, 0); if (fd < 0) @@ -299,6 +299,7 @@ sigaction(SIGALRM, &sa, &oldsa); if (sigsetjmp(ping_timeout_ctx, 1) == 0) { + ret = 0; alarm(initial_ping_timeout); if (connect(fd, un, sizeof(struct sockaddr_un)) < 0) goto cleanup; @@ -308,7 +309,8 @@ goto cleanup; if (n == 5 && !memcmp(buf, "pong\n", 5)) ret = 0; - } + } else + ret = -1; cleanup: sigaction(SIGALRM, &oldsa, NULL); alarm(0); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/tests/test new/mcelog-1.20/tests/test --- old/mcelog-1.0.8/tests/test 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/tests/test 2015-06-15 14:25:23.000000000 +0200 @@ -17,6 +17,8 @@ exit 1 fi +[ ! 
-f /dev/mce-inject ] && modprobe mce-inject + echo "++++++++++++ running $1 test +++++++++++++++++++" # disable trigger diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/trigger.c new/mcelog-1.20/trigger.c --- old/mcelog-1.0.8/trigger.c 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/trigger.c 2015-06-15 14:25:23.000000000 +0200 @@ -115,11 +115,18 @@ static void child_handler(int sig, siginfo_t *si, void *ctx) { int status; + pid_t pid; + if (waitpid(si->si_pid, &status, WNOHANG) < 0) { SYSERRprintf("Cannot collect child %d", si->si_pid); return; } finish_child(si->si_pid, status); + + /* Check other child(ren)'s status to avoid zombie process */ + while ((pid = waitpid(-1, &status, WNOHANG)) > 0) { + finish_child(pid, status); + } } void trigger_setup(void) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/triggers/unknown-error-trigger new/mcelog-1.20/triggers/unknown-error-trigger --- old/mcelog-1.0.8/triggers/unknown-error-trigger 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/triggers/unknown-error-trigger 2015-06-15 14:25:23.000000000 +0200 @@ -9,7 +9,7 @@ # CPU Linux CPU number that triggered the error # STATUS IA32_MCi_STATUS register value # ADDR IA32_MCi_ADDR register value -# MISC IA32_MCi_MISC regiser value +# MISC IA32_MCi_MISC register value # MCGSTATUS IA32_MCG_STATUS register value # MCGCAP IA32_MCG_CAP register value # For details on the register layout please see the Intel SDM http://www.intel.com/sdm diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/mcelog-1.0.8/version.h new/mcelog-1.20/version.h --- old/mcelog-1.0.8/version.h 2014-12-20 19:35:05.000000000 +0100 +++ new/mcelog-1.20/version.h 2015-06-15 14:25:23.000000000 +0200 @@ -1,2 +1,2 @@ -#define MCELOG_VERSION "1.0pre" +#define MCELOG_VERSION "1.20" ++++++ mcelog_invert_prefill_db_warning.patch ++++++ --- 
/var/tmp/diff_new_pack.sdK5G2/_old 2015-07-05 17:57:40.000000000 +0200 +++ /var/tmp/diff_new_pack.sdK5G2/_new 2015-07-05 17:57:40.000000000 +0200 @@ -2,11 +2,11 @@ memdb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) -Index: mcelog-1.0.1/memdb.c +Index: mcelog-1.20/memdb.c =================================================================== ---- mcelog-1.0.1.orig/memdb.c -+++ mcelog-1.0.1/memdb.c -@@ -421,11 +421,11 @@ void prefill_memdb(void) +--- mcelog-1.20.orig/memdb.c 2015-06-15 14:25:23.000000000 +0200 ++++ mcelog-1.20/memdb.c 2015-06-15 15:15:25.925718586 +0200 +@@ -422,11 +422,11 @@ md->location = xstrdup(bl); md->name = xstrdup(dmi_getstring(&d->header, d->device_locator)); }
