Problem:
Sometimes (after a remote gdb has been connected) an x86 SMP kernel (with KGDB
and the NMI watchdog enabled) hangs when kernel modules are automatically loaded.
Root Cause:
A slave CPU hangs in kgdb_wait() when the master CPU leaves KGDB, causing the
whole system to hang.
If a watchdog NMI occurs after a slave CPU has already exited kgdb_wait() but
before the master CPU has cleared debugger_active, the slave CPU can re-enter
kgdb_wait(). Because procindebug[atomic_read(&debugger_active) - 1] is zero (the
master CPU sets procindebug[MasterCPU] to zero before exiting), the slave CPU
loops in kgdb_wait():
...
/* Wait till master processor goes completely into the debugger.
*/
while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])) {
int i = 10; /* an arbitrary number */
while (--i)
cpu_relax();
}
...
The slave CPU loops until the master CPU has completely exited KGDB and set
debugger_active to zero.
But once debugger_active becomes zero, the slave CPU does not leave the loop.
Instead it hangs in the while loop, because atomic_read(&debugger_active) now
returns 0 and the loop starts checking procindebug[-1]:
...
while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])){...}
...
For me procindebug[-1] is always zero, so the slave CPU hangs in the NMI handler
and stops accepting NMIs. This leads to a hang of the whole system.
How Solved:
A new atomic variable, debugger_exiting, was added. It is set when the master
CPU starts waiting for the slave CPUs, and is reset after debugger_active is set
to zero. debugger_exiting is checked in kgdb_notify(), and kgdb_nmihook() is not
called until debugger_exiting equals zero. So debugger_exiting guarantees that
a slave CPU won't re-enter kgdb_wait() until the master CPU has completely left
KGDB.
Patch against kernel 2.6.24.3.
Signed-off-by: Konstantin Baydarov <[EMAIL PROTECTED]>
arch/x86/kernel/kgdb_32.c | 9 ++++++---
arch/x86/kernel/kgdb_64.c | 9 ++++++---
include/linux/kgdb.h | 1 +
kernel/kgdb.c | 4 ++++
4 files changed, 17 insertions(+), 6 deletions(-)
Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c
===================================================================
--- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_32.c
+++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c
@@ -326,14 +326,16 @@ static int kgdb_notify(struct notifier_b
switch (cmd) {
case DIE_NMI:
- if (atomic_read(&debugger_active)) {
+ if (atomic_read(&debugger_active) &&
+ !atomic_read(&debugger_exiting)) {
/* KGDB CPU roundup */
kgdb_nmihook(raw_smp_processor_id(), regs);
return NOTIFY_STOP;
}
return NOTIFY_DONE;
case DIE_NMI_IPI:
- if (atomic_read(&debugger_active)) {
+ if (atomic_read(&debugger_active) &&
+ !atomic_read(&debugger_exiting)) {
/* KGDB CPU roundup */
if (kgdb_nmihook(raw_smp_processor_id(), regs))
return NOTIFY_DONE;
@@ -341,7 +343,8 @@ static int kgdb_notify(struct notifier_b
}
return NOTIFY_DONE;
case DIE_NMIWATCHDOG:
- if (atomic_read(&debugger_active)) {
+ if (atomic_read(&debugger_active) &&
+ !atomic_read(&debugger_exiting)) {
/* KGDB CPU roundup */
kgdb_nmihook(raw_smp_processor_id(), regs);
return NOTIFY_STOP;
Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c
===================================================================
--- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_64.c
+++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c
@@ -406,14 +406,16 @@ static int kgdb_notify(struct notifier_b
switch (cmd) {
case DIE_NMI:
- if (atomic_read(&debugger_active)) {
+ if (atomic_read(&debugger_active) &&
+ !atomic_read(&debugger_exiting)) {
/* KGDB CPU roundup */
kgdb_nmihook(raw_smp_processor_id(), regs);
return NOTIFY_STOP;
}
return NOTIFY_DONE;
case DIE_NMI_IPI:
- if (atomic_read(&debugger_active)) {
+ if (atomic_read(&debugger_active) &&
+ !atomic_read(&debugger_exiting)) {
/* KGDB CPU roundup */
if (kgdb_nmihook(raw_smp_processor_id(), regs))
return NOTIFY_DONE;
@@ -421,7 +423,8 @@ static int kgdb_notify(struct notifier_b
}
return NOTIFY_DONE;
case DIE_NMIWATCHDOG:
- if (atomic_read(&debugger_active)) {
+ if (atomic_read(&debugger_active) &&
+ !atomic_read(&debugger_exiting)) {
/* KGDB CPU roundup */
kgdb_nmihook(raw_smp_processor_id(), regs);
return NOTIFY_STOP;
Index: ko_2_6_24_3_kgdb/include/linux/kgdb.h
===================================================================
--- ko_2_6_24_3_kgdb.orig/include/linux/kgdb.h
+++ ko_2_6_24_3_kgdb/include/linux/kgdb.h
@@ -281,6 +281,7 @@ extern int kgdb_handle_exception(int ex_
extern int kgdb_nmihook(int cpu, void *regs);
extern int debugger_step;
extern atomic_t debugger_active;
+extern atomic_t debugger_exiting;
#else
/* Stubs for when KGDB is not set. */
static const atomic_t debugger_active = ATOMIC_INIT(0);
Index: ko_2_6_24_3_kgdb/kernel/kgdb.c
===================================================================
--- ko_2_6_24_3_kgdb.orig/kernel/kgdb.c
+++ ko_2_6_24_3_kgdb/kernel/kgdb.c
@@ -117,6 +117,8 @@ int debugger_step;
static atomic_t kgdb_sync = ATOMIC_INIT(-1);
atomic_t debugger_active;
EXPORT_SYMBOL(debugger_active);
+atomic_t debugger_exiting = ATOMIC_INIT(0);
+EXPORT_SYMBOL(debugger_exiting);
/* Our I/O buffers. */
static char remcom_in_buffer[BUFMAX];
@@ -1526,6 +1528,7 @@ default_handle:
atomic_set(&procindebug[processor], 0);
if (!debugger_step || !kgdb_contthread) {
+ atomic_set(&debugger_exiting, 1);
for (i = 0; i < NR_CPUS; i++)
spin_unlock(&slavecpulocks[i]);
/* Wait till all the processors have quit
@@ -1557,6 +1560,7 @@ default_handle:
kgdb_restore:
/* Free debugger_active */
atomic_set(&debugger_active, 0);
+ atomic_set(&debugger_exiting, 0);
atomic_set(&kgdb_sync, -1);
clocksource_touch_watchdog();
kgdb_softlock_skip[processor] = 1;
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Kgdb-bugreport mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/kgdb-bugreport