Add the common crash_stop code, including the design level documentation.

Signed-off-by: Keith Owens <[EMAIL PROTECTED]>
---
 kernel/Makefile     |    1 
 kernel/crash_stop.c |  843 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c        |    5 
 3 files changed, 848 insertions(+), 1 deletion(-)

Index: linux/kernel/Makefile
===================================================================
--- linux.orig/kernel/Makefile
+++ linux/kernel/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_CRASH_STOP_SUPPORTED) += crash_stop.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <[EMAIL PROTECTED]>, the -fno-omit-frame-pointer is
Index: linux/kernel/crash_stop.c
===================================================================
--- /dev/null
+++ linux/kernel/crash_stop.c
@@ -0,0 +1,843 @@
+/*
+ * linux/kernel/crash_stop.c
+ *
+ * Copyright (C) 2006 Keith Owens <[EMAIL PROTECTED]>
+ *
+ * Bring the system to a crash stop for debugging by stopping all the online
+ * cpus apart from the current cpu.  To interrupt the other cpus, first send a
+ * normal IPI, if any cpus have not responded after a few seconds then send a
+ * non-maskable interrupt.
+ *
+ * Most of this code disappears with CONFIG_SMP=n.  It devolves to running the
+ * callback routine on this cpu as the monarch and setting up the saved state
+ * for this cpu.  That gives a common interface for debug style tools, even on
+ * UP.
+ *
+ * These routines can be used by any debug style code that needs to stop the
+ * other cpus in the system, including those cpus that are not responding to
+ * normal interrupts.  Debug style code includes debuggers such as kdb, kgdb,
+ * nlkd as well as dump tools such as netdump, lkcd, kdump.  All these tools
+ * have the same basic synchronization requirements: stop all the cpus, save
+ * the complete state of the tasks that were running, then do some work on
+ * the current cpu.
+ *
+ * For each invocation of crash_stop, one cpu is the monarch, the other cpus
+ * are slaves.  There is no external guarantee of ordering between monarch and
+ * slave events.  The most common case is when the monarch is invoked via
+ * crash_stop(), it then drives the debugger's callback on the slave cpus,
+ * followed by the callback on the monarch cpu.
+ *
+ * Some architectures (IA64 in particular) define their own global machine
+ * synchronization events where a global event can drive the slave cpus either
+ * before or after the monarch.  See INIT in Documentation/ia64/mca.txt.
+ *
+ * To hide the external monarch/slave races from the users of crash_stop, this
+ * code enforces a standard order on the events.  The debugger's callback
+ * routine is invoked on all the slaves "at the same time", followed 10 ms
+ * later by the callback on the monarch cpu.  Typically the callback will spin
+ * on the slave cpus until the monarch callback has done its work and released
+ * the slave cpus.
+ *
+ * There is no guarantee that all online cpus will be in crash_stop state when
+ * the monarch is entered.  If a cpu or chipset is so badly hung that it will
+ * not even respond to NMI then there will be no state for that cpu in
+ * crash_stop_running_process.
+ *
+ * A live locked system can result in a slave cpu processing the crash_stop IPI
+ * _after_ the monarch cpu has done its processing and left crash_stop status.
+ * The slave will not service the normal IPI fast enough (it is live locked
+ * with interrupts disabled) so it will be interrupted by NMI.  The monarch
+ * does its work and leaves crash_stop.  Later the slave gets out of the live
+ * lock and services the crash_stop IPI, but now there is no monarch to do
+ * anything.  To catch this delayed event, a crash_stop IPI is ignored if there
+ * is no current monarch.
+ *
+ * For some events, we cannot tell straight away if we want to debug the event
+ * or not.  For example, an IA64 MCA is architecturally defined to stop all the
+ * slaves before entering the monarch.  Only when the monarch is entered do we
+ * get any data on the event, it is only on the monarch that we can tell if the
+ * MCA is recoverable or not.  In this case, the monarch must call
+ * crash_stop_recovered() instead of crash_stop().  crash_stop_recovered()
+ * releases all the slaves.  Neither the slaves nor the monarch will use the
+ * callback routine.
+ *
+ * All routines are entered with interrupts disabled.  If necessary, the caller
+ * must disable interrupts before calling crash_stop.
+ */
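+
+/* As a sketch of typical use (the my_debugger_* names below are illustrative
+ * and are not part of this patch), a debugger entry point disables
+ * interrupts, then calls crash_stop() with a callback that spins on the slave
+ * cpus until the monarch callback has finished its work and released them:
+ *
+ *	static atomic_t my_debugger_hold;
+ *
+ *	static void my_debugger_callback(int monarch, void *data)
+ *	{
+ *		if (!monarch) {
+ *			while (atomic_read(&my_debugger_hold)) {
+ *				touch_nmi_watchdog();
+ *				cpu_relax();
+ *			}
+ *			return;
+ *		}
+ *		... do the debugger's work on the monarch here ...
+ *		atomic_set(&my_debugger_hold, 0);
+ *	}
+ *
+ *	static void my_debugger_entry(struct pt_regs *regs)
+ *	{
+ *		unsigned long flags;
+ *
+ *		atomic_set(&my_debugger_hold, 1);
+ *		local_irq_save(flags);
+ *		crash_stop(my_debugger_callback, NULL, NULL, regs,
+ *			   "my_debugger");
+ *		local_irq_restore(flags);
+ *	}
+ */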
+
+
+/* There are several possible scenarios for using crash_stop:
+ *
+ * (1) An explicit call to crash_stop from debugging code.  For example, a
+ *     direct entry into a debugger or an explicit request to dump via sysrq.
+ *     The debugging code calls crash_stop() which stops the slaves.
+ *
+ * (2) A nested call to crash_stop on the same cpu.  For example, a user is
+ *     debugging and they decide to take a kernel dump from inside the
+ *     debugger.  The debugger has already brought the system to crash_stop
+ *     state so the dump callback will be called on the current cpu (the
+ *     monarch) but not on the slaves.  The dump code uses the data that is
+ *     already in crash_stop_running_process[].
+ *
+ * (3) Concurrent calls to crash_stop on separate cpus.  One cpu will become
+ *     the monarch for one of the events and interrupt all the others,
+ *     including any cpus that are also trying to enter crash_stop.  When the
+ *     current monarch finishes, the other cpus will race for the crash_stop
+ *     lock and one will become the new monarch (assuming the system is still
+ *     usable).
+ *
+ * (4) A system error occurs and drives the notify_die callback chain; this one
+ *     can be tricky.  It is not known which entries on the notify_die chain
+ *     will do any work, but all of them need to see the same system state.  An
+ *     arch dependent crash_stop callback is called at the start and end of the
+ *     notify_die chain.  At the start it brings the system into crash_stop
+ *     state, using its own callbacks on the slave cpus.  Then it holds the
+ *     slave cpus and releases the monarch cpu.  This allows the rest of the
+ *     entries on the notify_die chain to run, each of them can call crash_stop
+ *     and run their callback on the current cpu and the slaves.  At the end of
+ *     the notify_die chain, the main crash_stop code releases the slave cpus.
+ *     This gives a consistent view of the system to all the entries on the
+ *     notify_die chain.
+ *
+ *     To make things more interesting, crash_stop() can be entered for one
+ *     reason then a software interrupt or NMI can come over the top.  That
+ *     will result in a notify_chain being run while the system is already in
+ *     crash_stop state.  Which means that any calls from the notify_chain to
+ *     crash_stop() must be treated as nested calls.
+ *
+ *     Finally it is just possible to have multiple levels of notify_chain
+ *     running at the same time.  For example, an oops occurs and drives the
+ *     notify_chain.  At the start of that chain, the slaves are put into
+ *     crash_stop state and the monarch is allowed to run the chain.  A
+ *     callback on the chain breaks or loops and a second scan of the
+ *     notify_chain is done for this nested failure.  For this case, crash_stop
+ *     must ignore the use of the second notify_chain and treat any calls as
+ *     nested ones.
+ *
+ * The various states are a little complicated, because the code has to cope
+ * with normal calls, nested calls, concurrent calls on separate cpus,
+ * keeping a consistent view for the life of a notify_chain plus nested events
+ * that involve notify_chains.  And do it all without deadlocks, particularly
+ * on non-maskable interrupts.  A few rules :-
+ *
+ *   Variables cs_lock_owner, cs_monarch and cs_notify_chain_owner hold a cpu
+ *   number, -1 is 'not set'.  cs_notify_chain_depth is a counter.  These
+ *   variables are only updated on the monarch cpu.  The variables are
+ *   protected by cs_lock or by the fact that the current cpu is handling a
+ *   nested monarch event.
+ *
+ *   Entering a nested call only affects the monarch cpu.  The slave cpus will
+ *   continue to spin in the callback for the first crash_stop() event.  Nested
+ *   calls cannot take cs_lock, they would deadlock.
+ *
+ *   Returning from a nested call does not clear cs_monarch nor release the
+ *   slaves.
+ *
+ *   If a monarch gets the lock and cs_notify_chain_owner is not the current
+ *   cpu then another cpu is already running a notify_chain.  This monarch must
+ *   back off and wait for the other cpu to finish running its notify_chain.
+ *
+ *   Returning from a notify_chain call clears cs_monarch but does not release
+ *   the slaves.  Instead the slaves loop inside this code, in the expectation
+ *   that another notify_chain driven routine will call crash_stop and will
+ *   need the slaves.  Unlike a nested call, the slaves will use the supplied
+ *   callback for each entry on the notify_chain that calls crash_stop().
+ *
+ *   If cs_notify_chain_owner is already set to the monarch cpu on entry to a
+ *   notify_chain then ignore the use of the chain.  Any calls to crash_stop()
+ *   from entries on the chain will be treated as nested calls.
+ *
+ * Why the difference between nested calls and a notify_chain?  Mainly because
+ * the entries on a notify_chain are defined to be separate, also crash_stop
+ * can easily detect the start and end of running the chain.  With a nested
+ * call, there is no way to tell if the first callback will use crash_stop() a
+ * second time.  Nested calls can result from explicit calls to other debug
+ * style code or from an error in the current callback.  On a nested call, the
+ * monarch callback owns and controls the slaves, they are out of crash_stop()
+ * control.  Only the monarch callback can release the slaves by leaving
+ * crash_stop() state, at which point the second call to crash_stop is not a
+ * nested call.
+ */
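+
+/* For example, scenario (2) needs no special handling by the caller: a dump
+ * routine invoked from inside a debugger's monarch callback simply calls
+ * crash_stop() again.  The nested call runs the dump callback on the monarch
+ * only, reusing the state already saved in crash_stop_running_process[].
+ * (my_dump_callback below is illustrative and not part of this patch.)
+ *
+ *	static void my_debugger_callback(int monarch, void *data)
+ *	{
+ *		...
+ *		if (monarch)
+ *			crash_stop(my_dump_callback, NULL, NULL,
+ *				   get_irq_regs(), "dump from debugger");
+ *		...
+ *	}
+ */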
+
+/* FIXME (maybe): There is a possible deadlock scenario:
+ *
+ * A monarch cpu calls crash_stop().
+ * All the slave cpus are put into crash_stop() state.
+ * One of the slaves gets a non-maskable interrupt - where from?
+ * The slave calls crash_stop() and spins waiting for cs_lock.
+ * The monarch exits and waits for all the slaves to exit.
+ * The slave that took NMI will not exit until cs_lock is free.
+ * The monarch will not free cs_lock until all the slaves exit.
+ *
+ * This deadlock can only occur if some external hardware generates an NMI and
+ * that NMI is sent to slave cpus instead of the monarch.  Until that situation
+ * can be demonstrated (and any workaround can be tested), I am going to ignore
+ * this scenario - KAO.
+ */
+
+/* Danger - Here there be races and compiler/hardware reordering gotchas.
+ *
+ * This code relies on variables that must be set on one cpu and seen on other
+ * cpus in the right order.  Both the compiler and the hardware can reorder
+ * operations, so use memory barriers when necessary.
+ *
+ * The biggest problem is that the compiler does not know about the other cpus,
+ * so the compiler may incorrectly think that an operation on this cpu has no
+ * side effects and may move the operation or even optimize it away.  To be on
+ * the safe side and to document the ordering requirements, barriers have been
+ * used wherever there is even the remote possibility of a current or future
+ * compiler being too smart for its own good.  Look for 'barrier:' comments.
+ *
+ * Obviously calls to spin_lock/spin_unlock are already barriers.  Only the
+ * additional barrier operations are commented.
+ */
+
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/crash_stop.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <linux/nmi.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+
+static DEFINE_SPINLOCK(cs_lock);
+static int cs_lock_owner = -1;
+static int cs_monarch = -1;
+static int cs_monarch_depth;
+static int cs_notify_chain_owner = -1;
+static int cs_notify_chain_depth;
+static int cs_notify_chain_ended;
+static int cs_leaving;
+static atomic_t cs_common_cpu_slaves;
+
+static int cs_recovered;
+
+static cpumask_t cs_cpu_mask, cs_sent_ipi, cs_sent_nmi;
+
+static struct crash_stop_running_process crash_stop_running_process[NR_CPUS];
+
+struct cs_global cs_global;
+
+/* Use a local version of mdelay because RedHat patch their kernel to give a
+ * warning when mdelay is used with interrupts disabled.  Why do RedHat do
+ * these silly things, have they never heard of debugging?
+ */
+static void
+cs_mdelay(int ms)
+{
+       while (ms > 0) {
+               touch_nmi_watchdog();
+               udelay(1000);   /* barrier: udelay -> cpu_relax -> barrier */
+               --ms;
+       }
+}
+
+static void
+cs_cpu_relax_watchdog(void)
+{
+       touch_nmi_watchdog();
+       cpu_relax();    /* barrier: cpu_relax -> barrier */
+}
+
+/* barrier: On some architectures, set_mb() uses xchg so it only works on 1, 2,
+ * 4 or 8 byte quantities.  This violates Documentation/memory-barriers.txt
+ * which implies that set_mb can be used on any data type.  The only
+ * requirement is that there be a memory barrier after assigning the value, so
+ * define our own version that uses generic operations.
+ */
+#define cs_set_mb(var, value) do { (var) = (value); mb(); } while (0)
+
+/* If we cannot safely use an external print routine then save any messages in
+ * a local buffer, allowing 256 bytes of messages per cpu.  This code is not
+ * performance sensitive so we take the time to left justify the entire buffer
+ * instead of using ring pointers, this removes the need for users to cope with
+ * wrapped cs_msg text when analysing a crash_stopped kernel.
+ */
+
+static char cs_msg[256*NR_CPUS];
+static DEFINE_SPINLOCK(cs_msg_lock);
+static int cs_msg_lock_owner = -1;
+
+static asmlinkage int
+cs_printk(const char * fmt, ...)
+{
+       int l, ret, shift;
+       va_list ap;
+       /* If we get NMI'd during this code then discard any messages for the
+        * nested event.  Either way we lose some messages and it is far easier
+        * (and safer) to discard the nested messages.
+        */
+       if (cs_msg_lock_owner == smp_processor_id())
+               return 0;
+       spin_lock(&cs_msg_lock);
+       /* barrier: setting cs_msg_lock_owner must not move down */
+       set_mb(cs_msg_lock_owner, smp_processor_id());
+       l = strlen(cs_msg);
+       while (1) {
+               va_start(ap, fmt);
+               ret = vsnprintf(cs_msg+l, sizeof(cs_msg)-l, fmt, ap);
+               va_end(ap);
+               if (l == 0 || ret < sizeof(cs_msg)-l)
+                       break;
+               shift = sizeof(cs_msg) / 10;
+               shift = max(shift, ret);
+               shift = min(shift, l);
+               l -= shift;
+               memcpy(cs_msg, cs_msg+shift, l);
+               memset(cs_msg+l, 0, sizeof(cs_msg)-l);
+       }
+       /* barrier: clearing cs_msg_lock_owner must not move up */
+       barrier();
+       cs_msg_lock_owner = -1;
+       spin_unlock(&cs_msg_lock);
+       return ret;
+}
+
+/* At the start of a notify_chain, all cpus are driven into this routine, via
+ * cs_common_cpu().  It is a dummy callback, cs_common_cpu() takes care of
+ * holding the slave cpus until the end of the notify_chain.
+ */
+static void
+cs_notify_callback(int monarch, void *data)
+{
+}
+
+/* Called by the arch specific crash_stop code, when they see a notify_chain()
+ * event that debug style code might care about.
+ */
+void
+cs_notify_chain_start(struct pt_regs *regs)
+{
+       int cpu = smp_processor_id();
+       WARN_ON(!irqs_disabled());
+       while (cs_leaving)
+               cs_cpu_relax_watchdog();
+       if (cs_lock_owner == cpu || cs_notify_chain_owner == cpu) {
+               /* This cpu is already the crash_stop monarch, so the slaves
+                * are already stopped.  Ignore the fact that we are being
+                * called from a notify_chain, instead any calls to crash_stop
+                * from the chain will be treated as nested calls.
+                */
+               ++cs_notify_chain_depth;
+               return;
+       }
+retry:
+       spin_lock(&cs_lock);
+       if (cs_notify_chain_owner >= 0) {
+               /* another monarch is running a notify_chain, back off */
+               spin_unlock(&cs_lock);
+               cs_mdelay(1);
+               goto retry;
+       }
+       set_mb(cs_lock_owner, cpu);
+       set_mb(cs_notify_chain_owner, cpu);
+       set_mb(cs_lock_owner, -1);
+       spin_unlock(&cs_lock);
+       crash_stop(cs_notify_callback, NULL, NULL, regs, __FUNCTION__);
+}
+
+/* Called by the arch specific crash_stop code, when they reach the end of a
+ * notify_chain() event that debug style code might care about.  It is also
+ * called by notifier_call_chain() when it does an early termination of the
+ * chain, that call is required because the arch code will now not be called
+ * for the end of the chain.  For the latter case, do not assume that interrupts
+ * are disabled.  Which in turn means using raw_smp_processor_id() to check if
+ * this cpu is running a notify_chain or not.
+ */
+void
+cs_notify_chain_end(void)
+{
+       int cpu = raw_smp_processor_id();
+       while (cs_leaving)
+               cs_cpu_relax_watchdog();
+       if (cs_lock_owner == cpu || cs_notify_chain_owner == cpu) {
+               WARN_ON(!irqs_disabled());
+               if (cs_notify_chain_depth) {
+                       /* end of a nested chain */
+                       --cs_notify_chain_depth;
+                       return;
+               }
+               spin_lock(&cs_lock);
+               set_mb(cs_lock_owner, cpu);
+               /* barrier: setting cs_notify_chain_ended must not move down */
+               set_mb(cs_notify_chain_ended, 1);
+               while (atomic_read(&cs_common_cpu_slaves))
+                       cs_cpu_relax_watchdog();
+               set_mb(cs_notify_chain_ended, 0);
+               set_mb(cs_notify_chain_owner, -1);
+               set_mb(cs_lock_owner, -1);
+               spin_unlock(&cs_lock);
+       }
+}
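+
+/* As a sketch, the arch specific oops/die path is expected to bracket its
+ * notify chain with these two calls, along the lines of (the notify_die()
+ * arguments shown are illustrative; the exact call varies by architecture):
+ *
+ *	cs_notify_chain_start(regs);
+ *	notify_die(DIE_OOPS, str, regs, err, trap, SIGSEGV);
+ *	cs_notify_chain_end();
+ */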
+
+static void
+cs_online_cpu_status(const char *text)
+{
+#ifdef CONFIG_SMP
+       int slaves = num_online_cpus() - 1, count = 0, cpu, unknown;
+       if (!slaves)
+               return;
+       for_each_online_cpu(cpu) {
+               if (cpu_isset(cpu, cs_cpu_mask) &&
+                   cpu != smp_processor_id())
+                       ++count;
+       }
+       unknown = slaves - count;
+       if (unknown == 0)
+               cs_global.print(
+                       "All cpus are in crash_stop for %s\n", text);
+       else {
+               int first_print = 1, start = -1, stop = -1;
+               cs_global.print("%d cpu%s ",
+                       unknown,
+                       unknown == 1 ? "" : "s");
+               for (cpu = 0; cpu <= NR_CPUS; ++cpu) {
+                       if (cpu == NR_CPUS ||
+                           !cpu_online(cpu) ||
+                           cpu_isset(cpu, cs_cpu_mask) ||
+                           cpu == smp_processor_id())
+                               stop = cpu;
+                       if (stop >= 0 && start >= 0) {
+                               if (first_print) {
+                                       cs_global.print("(");
+                                       first_print = 0;
+                               } else {
+                                       cs_global.print(", ");
+                               }
+                               cs_global.print("%d", start);
+                               if (stop - 1 > start)
+                                       cs_global.print("-%d", stop - 1);
+                               stop = -1;
+                               start = -1;
+                       }
+                       if (cpu < NR_CPUS &&
+                           cpu_online(cpu) &&
+                           !cpu_isset(cpu, cs_cpu_mask) &&
+                           cpu != smp_processor_id() &&
+                           start < 0)
+                               start = cpu;
+               }
+               cs_global.print(") %s not in crash_stop for %s, %s state is unknown\n",
+                       unknown == 1 ? "is" : "are",
+                       text,
+                       unknown == 1 ? "its" : "their");
+       }
+#else  /* !CONFIG_SMP */
+       cs_global.print(
+               "All cpus are in crash_stop for %s\n", text);
+#endif /* CONFIG_SMP */
+}
+
+#ifdef CONFIG_SMP
+/* Should only be called by the arch interrupt handlers, when the slave cpus
+ * receive the crash_stop specific IPI.
+ */
+void
+cs_common_ipi(void)
+{
+       while (cs_leaving)
+               cs_cpu_relax_watchdog();
+       if (!cs_global.callback) {
+               printk(KERN_DEBUG "Ignoring late cs_ipi on cpu %d\n",
+                      smp_processor_id());
+               return;
+       }
+       crash_stop_slave();
+}
+
+/* Should only be called by the arch specific NMI handlers, to see if this NMI
+ * is for crash_stop or for something else.  On most architectures, an NMI
+ * signal carries no state so we have to maintain an external state to indicate
+ * why it was sent.
+ *
+ * Note: this function is only valid when a slave is entering crash_stop()
+ * state.  Due to races between the time that the monarch releases a slave and
+ * the slave actually exiting, it is not safe to call this routine while a
+ * slave is leaving.  It is up to the calling code to save the state of
+ * crash_stop_sent_nmi() on entry if they need to test it on exit.
+ */
+int
+crash_stop_sent_nmi(void)
+{
+       return cpu_isset(smp_processor_id(), cs_sent_nmi);
+}
+#endif /* CONFIG_SMP */
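+
+/* A rough sketch of the expected use from an arch NMI handler.  The handler
+ * itself is illustrative only, and whether the NMI path then enters via
+ * crash_stop_slave() or cs_common_ipi() is an arch decision.  Note that the
+ * value is saved on entry, as described above:
+ *
+ *	int for_crash_stop = crash_stop_sent_nmi();
+ *
+ *	... the rest of the NMI handling ...
+ *
+ *	if (for_crash_stop)
+ *		crash_stop_slave();
+ */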
+
+/* Should only be called by the arch specific crash_stop code, after they have
+ * saved any arch specific state.  The call chain is :-
+ *
+ * crash_stop() [monarch] or cs_common_ipi() [slave] ->
+ *   crash_stop_slave() [common front end code] ->
+ *     cs_arch_cpu() [arch dependent code] ->
+ *       cs_common_cpu() [common back end code] ->
+ *         external crash_stop callback
+ *
+ * When cs_common_cpu() is entered for a slave cpu, it must spin while
+ * cs_monarch < 0.  That enforces the order of slave callbacks first, then
+ * monarch callback.
+ *
+ * When handling a notify_chain, park the slave cpus in this holding routine
+ * while the monarch cpu runs down the notify_chain.  If any entry on the
+ * notify_chain calls crash_stop_slave() then release the slaves to the
+ * corresponding crash_stop callback.  On return from the callback, put them
+ * back in a holding loop.  The state of the slave cpus is not significantly
+ * changed by this process and each caller of crash_stop_slave() gets the same
+ * data in crash_stop_running_process.  IOW, all entries on the notify_chain
+ * see the state that was saved by the first crash_stop entry on the chain, not
+ * some state that changes as the monarch runs the notify_chain.
+ */
+void
+cs_common_cpu(int monarch)
+{
+       if (monarch) {
+               if (!cs_recovered) {
+                       ++cs_monarch_depth;
+                       cs_global.callback(1, cs_global.data);
+                       --cs_monarch_depth;
+               }
+               return;
+       }
+       atomic_inc(&cs_common_cpu_slaves);
+       do {
+               /* slaves wait until the monarch enters */
+               while (cs_monarch < 0 && !cs_notify_chain_ended)
+                       cs_cpu_relax_watchdog();
+               if (cs_notify_chain_ended)
+                       break;
+               if (!cs_recovered)
+                       cs_global.callback(0, cs_global.data);
+               /* slaves wait until the monarch leaves */
+               while (cs_monarch >= 0)
+                       cs_cpu_relax_watchdog();
+       } while (cs_notify_chain_owner >= 0);
+       atomic_dec(&cs_common_cpu_slaves);
+}
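+
+/* The arch specific cs_arch_cpu() referred to above is not part of this file.
+ * As a rough sketch of its expected shape (the body shown is illustrative
+ * only), it saves whatever arch specific state it wants in the
+ * crash_stop_running_process entry and then enters the common back end:
+ *
+ *	void cs_arch_cpu(int monarch, struct crash_stop_running_process *r)
+ *	{
+ *		... save any arch specific state via r ...
+ *		cs_common_cpu(monarch);
+ *	}
+ */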
+
+#ifdef CONFIG_SMP
+/* The monarch has to wait for the slaves to enter crash_stop state.  Wait for
+ * up to 3 seconds plus an extra 100 ms per online cpu to cope with live lock
+ * on systems with large cpu counts.  These are arbitrary numbers, it might be
+ * worth exposing them as /sys values so sites can tune their debugging.
+ * Review this after we have more experience with this code - KAO.
+ */
+static void
+cs_wait_for_cpus(void)
+{
+       int count, prev_count = 0, sent_nmi = 0, t, wait_secs, slaves, cpu;
+       slaves = num_online_cpus() - 1;
+       wait_secs = 3 + (slaves * 100) / 1000;
+       cs_mdelay(100);
+       for (t = 0; t < wait_secs; ++t) {
+               count = 0;
+               slaves = num_online_cpus() - 1;
+               for_each_online_cpu(cpu) {
+                       if (cpu_isset(cpu, cs_cpu_mask))
+                               ++count;
+               }
+               if (count == slaves)
+                       break;
+               if (prev_count != count) {
+                       cs_global.print(
+                               "  %d out of %d cpus in crash_stop, "
+                               "waiting for the rest, timeout in %d "
+                               "second(s)\n",
+                               count+1, slaves+1, wait_secs-t);
+                       prev_count = count;
+               }
+               cs_mdelay(1000);
+               if (!sent_nmi && t == min(wait_secs / 2, 5)) {
+                       for_each_online_cpu(cpu) {
+                               if (cpu_isset(cpu, cs_cpu_mask) ||
+                                   cpu_isset(cpu, cs_sent_nmi) ||
+                                   cpu == smp_processor_id())
+                                       continue;
+                               if (!sent_nmi) {
+                                       cs_global.print(" sending NMI ");
+                                       sent_nmi = 1;
+                               }
+                               cpu_set(cpu, cs_sent_nmi);
+                               smp_wmb();
+                               cs_arch_send_nmi(cpu);
+                       }
+               }
+               if (t % 4 == 0)
+                       cs_global.print(".");
+       }
+}
+#endif /* CONFIG_SMP */
+
+static void
+cs_stop_the_slaves(void)
+{
+#ifdef CONFIG_SMP
+       int sent_ipi = 0, cpu;
+       for_each_online_cpu(cpu) {
+               if (cpu_isset(cpu, cs_cpu_mask) ||
+                   cpu_isset(cpu, cs_sent_ipi) ||
+                   cpu == smp_processor_id())
+                       continue;
+               cpu_set(cpu, cs_sent_ipi);
+               cs_arch_send_ipi(cpu);
+               sent_ipi = 1;
+       }
+       if (sent_ipi)
+               cs_wait_for_cpus();
+#endif /* CONFIG_SMP */
+}
+
+/**
+ * cs_cpu: - Put the current cpu into crash_stop state.
+ * @monarch: 0 for a slave cpu, 1 for the monarch cpu.
+ *
+ * Invoked on every cpu that is being stopped, with no externally defined order
+ * between monarch and slaves.  The arch independent running state is saved
+ * here, then cs_arch_cpu() saves any arch specific state, followed by
+ * invocation of cs_common_cpu() which drives the callback routine.
+ */
+static void
+cs_cpu(int monarch)
+{
+       struct crash_stop_running_process *r, prev;
+       int cpu = smp_processor_id();
+       cpu_set(cpu, cs_cpu_mask);
+       r = crash_stop_running_process + cpu;
+       prev = *r;
+       r->p = current;
+       r->regs = get_irq_regs();
+       r->prev = &prev;
+       if (!prev.p) {
+               if (monarch) {
+                       /* Top level call to crash_stop().  Delay 10 ms to give
+                        * the slave callbacks (see cs_common_cpu()) a chance
+                        * to get started before running the callback on the
+                        * monarch.
+                        */
+                       set_mb(cs_monarch, cpu);
+                       cs_mdelay(10);
+               }
+       }
+       cs_arch_cpu(monarch, r);
+       *r = prev;
+       if (!prev.p) {
+               if (monarch) {
+                       set_mb(cs_leaving, 1);
+                       set_mb(cs_monarch, -1);
+               }
+               cpu_clear(cpu, cs_sent_ipi);
+               cpu_clear(cpu, cs_sent_nmi);
+               /* barrier: cs_cpu_mask functions as the main filter
+                * for the state of the cpus, flush preceding updates
+                * to memory before clearing cs_cpu_mask.
+                */
+               smp_mb__before_clear_bit();
+               cpu_clear(cpu, cs_cpu_mask);
+       }
+}
+
+#ifdef CONFIG_SMP
+/* crash_stop_slave: - Put the current slave cpu into crash_stop state.  */
+void
+crash_stop_slave(void)
+{
+       while (cs_leaving)
+               cs_cpu_relax_watchdog();
+       cs_cpu(0);
+}
+#endif /* CONFIG_SMP */
+
+/**
+ * crash_stop: - Bring the system to a crash stop for debugging.
+ * @callback: After each cpu has been interrupted, the callback is invoked on
+ * that cpu, with the monarch flag set to 0.  After all cpus have responded or
+ * the timeout has been reached then the callback is invoked on the current cpu
+ * with the monarch flag set to 1.
+ * @data: Callback specific data, crash_stop does not use this data.
+ * @print: Optionally, a debugger specific print routine.  If this is NULL
+ * then crash_stop will default to using cs_printk() and messages will be
+ * left justified in cs_msg[].
+ * @regs: Register state at the point of entry.  crash_stop() saves it with
+ * set_irq_regs() so it is available to the callbacks and recorded in
+ * crash_stop_running_process[] for this cpu.
+ * @text: Short text describing the reason for the stop, used in the status
+ * messages.
+ *
+ * Unlike stop_machine(), crash_stop() does not ask if the other cpus are
+ * ready to be stopped and will use non-maskable interrupts to stop cpus that
+ * do not respond after a few seconds.
+ *
+ * crash_stop() must be entered with interrupts disabled, it can even be
+ * entered from an NMI event.  It is the caller's responsibility to ensure that
+ * their print routine (if any) is safe in the current context.
+ *
+ * If the system has already entered a globally stopped state then sending IPI
+ * or NMI is pointless and may even be unsafe.  This particularly applies to
+ * MCA or global INIT on IA64, these events are already defined to stop the
+ * entire machine and they also prevent crash_stop() from sending any IPI or
+ * NMI events.  Only send IPI/NMI to cpus that are not yet in crash_stop state.
+ *
+ * The global structure crash_stop_running_process is updated with information
+ * about the tasks that are running on each cpu.  The debugger can use this
+ * information to start the analysis of the running tasks.
+ *
+ * This function cannot assume that the caller has already saved the pt_regs,
+ * so it does it anyway.  Some callers (e.g. oops) will have called
+ * set_irq_regs(), others (e.g. NMI watchdog) will not.
+ *
+ * Returns: 0 normal
+ *          -ENOSYS crash_stop is not supported on this architecture.
+ */
+
+int
+crash_stop(void (*callback)(int monarch, void *data),
+          void *data, printk_t print,
+          struct pt_regs *regs, const char *text)
+{
+       int cpu;
+       struct cs_global csg_save, csg = {
+               .callback = callback,
+               .data = data,
+               .print = print ? print : cs_printk,
+       };
+       struct pt_regs *old_regs;
+
+       WARN_ON(!irqs_disabled());
+retry:
+       if (!spin_trylock(&cs_lock)) {
+               if (cs_lock_owner == smp_processor_id()) {
+                       /* nested call on the same cpu */
+                       csg_save = cs_global;
+                       cs_set_mb(cs_global, csg);
+                       cs_online_cpu_status(text);
+                       cs_cpu(1);
+                       cs_set_mb(cs_global, csg_save);
+                       return 0;
+               }
+               /* concurrent call on another cpu */
+               while (cs_lock_owner != -1)
+                       cs_cpu_relax_watchdog();
+               goto retry;
+       }
+
+       if (cs_leaving) {
+               /* previous crash stop has not quite completed, back off */
+               spin_unlock(&cs_lock);
+               cs_mdelay(1);
+               goto retry;
+       }
+
+       if (cs_notify_chain_owner >= 0 &&
+           cs_notify_chain_owner != smp_processor_id()) {
+               /* another cpu is running a notify_chain, back off */
+               spin_unlock(&cs_lock);
+               cs_mdelay(1);
+               goto retry;
+       }
+
+       set_mb(cs_lock_owner, smp_processor_id());
+       old_regs = set_irq_regs(regs);
+       csg_save = cs_global;
+       cs_set_mb(cs_global, csg);
+       cs_stop_the_slaves();
+       cs_online_cpu_status(text);
+       cs_cpu(1);
+       set_mb(cs_leaving, 1);
+       if (cs_monarch < 0 && cs_notify_chain_owner < 0) {
+               /* leaving a normal call, wait for the slaves to exit */
+               for_each_online_cpu(cpu) {
+                       while (cpu_isset(cpu, cs_cpu_mask))
+                               cs_cpu_relax_watchdog();
+               }
+       }
+       cs_set_mb(cs_global, csg_save);
+       set_mb(cs_lock_owner, -1);
+       set_irq_regs(old_regs);
+       spin_unlock(&cs_lock);
+       set_mb(cs_leaving, 0);
+       return 0;
+}
+
+/**
+ * crash_stop_recovered: - Release any slaves in crash_stop state.
+ *
+ * On architectures that define their own global synchronization methods, the
+ * slave cpus may enter crash_stop state before the monarch.  If the monarch
+ * decides that the event is recoverable then the slaves need to be released
+ * from crash_stop, without invoking any callbacks.
+ *
+ * For recovered events, we do not always force the other cpus into slave
+ * state.  The assumption is that crash_stop_recovered() is only required on
+ * architectures that define their own global synchronization methods (e.g.
+ * IA64 MCA), in which case the architecture has already taken care of the
+ * slaves.  If no slave cpu is in crash_stop() state then do nothing, otherwise
+ * wait until all the slaves are in crash_stop().
+ *
+ * If the code that calls crash_stop_recovered() is in a notify_chain then the
+ * caller must call cs_notify_chain_end() before crash_stop_recovered().
+ * Calling this function when this cpu is the notify_chain owner is assumed to
+ * be a nested call and it is silently ignored.  IOW it is a recovery from a
+ * nested event and we want to hold the slaves until we exit from the top level
+ * of crash_stop code.
+ */
+void
+crash_stop_recovered(void)
+{
+       int cpu, any_slaves = 0;
+
+       WARN_ON(!irqs_disabled());
+       while (cs_leaving)
+               cs_cpu_relax_watchdog();
+       if (cs_notify_chain_owner >= 0 &&
+           cs_notify_chain_owner == smp_processor_id())
+               return;
+retry:
+       spin_lock(&cs_lock);
+       if (cs_notify_chain_owner >= 0) {
+               /* another cpu is running a notify_chain, back off */
+               spin_unlock(&cs_lock);
+               cs_mdelay(1);
+               goto retry;
+       }
+       set_mb(cs_lock_owner, smp_processor_id());
+       for_each_online_cpu(cpu) {
+               if (cpu_isset(cpu, cs_cpu_mask) &&
+                   cpu != smp_processor_id()) {
+                       any_slaves = 1;
+                       break;
+               }
+       }
+       if (any_slaves) {
+               /* give cs_stop_the_slaves/cs_wait_for_cpus a safe print
+                * routine.
+                */
+               struct cs_global csg_save, csg = {
+                       .print = cs_printk,
+               };
+               csg_save = cs_global;
+               cs_set_mb(cs_global, csg);
+               cs_stop_the_slaves();
+               cs_set_mb(cs_global, csg_save);
+       }
+       set_mb(cs_recovered, 1);
+       set_mb(cs_monarch, smp_processor_id());
+       for_each_online_cpu(cpu) {
+               while (cpu_isset(cpu, cs_cpu_mask))
+                       cs_cpu_relax_watchdog();
+       }
+       set_mb(cs_recovered, 0);
+       set_mb(cs_monarch, -1);
+       set_mb(cs_lock_owner, -1);
+       spin_unlock(&cs_lock);
+       return;
+}
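+
+/* As a sketch (the recoverability test and dump callback are illustrative;
+ * the details are arch and tool specific), the arch monarch code for a
+ * possibly recoverable event would look something like the following.  If
+ * the caller is inside a notify chain it must call cs_notify_chain_end()
+ * first, as described above.
+ *
+ *	if (event_is_recoverable) {
+ *		crash_stop_recovered();
+ *		return;
+ *	}
+ *	crash_stop(my_dump_callback, NULL, NULL, regs, "fatal MCA");
+ */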
+
+/**
+ * crash_stop_slaves: - Return the number of slave cpus that the user will see.
+ *
+ * For a non-nested call, the user will see all the cpus that are in crash_stop
+ * state.  For a nested call, the user will not see any slave cpus.
+ */
+int
+crash_stop_slaves(void)
+{
+       if (cs_monarch_depth == 1)
+               return atomic_read(&cs_common_cpu_slaves);
+       else
+               return 0;
+}
Index: linux/kernel/sys.c
===================================================================
--- linux.orig/kernel/sys.c
+++ linux/kernel/sys.c
@@ -29,6 +29,7 @@
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
+#include <linux/crash_stop.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -144,8 +145,10 @@ static int __kprobes notifier_call_chain
        while (nb) {
                next_nb = rcu_dereference(nb->next);
                ret = nb->notifier_call(nb, val, v);
-               if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+               if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) {
+                       cs_notify_chain_end();
                        break;
+               }
                nb = next_nb;
        }
        return ret;