On Monday 17 March 2008 11:55:28 pm Konstantin Baydarov wrote: > Problem: > Sometimes(after remote gdb was connected) x86 SMP kernel(with KGDB and NMI > watchdog enabled) hangs when kernel modules are automatically loaded.
Konstantin, The description below doesn't mention how module loading comes into picture. > > Root Cause: > Slave CPU hangs in kgdb_wait() when master CPU leaves KGDB, causing the > whole system to hang. > If watchdog NMI occurs when Slave CPU have already exited kgdb_wait() and > Master CPU haven't unset debugger_active, An NMI watchdog can't occur until kgdb_wait function returns, control goes to kgdb_nmihook, which returns control to kgdb_notify, which in turn returns through the notify chain call returns, do_nmi, and then to entry.S, where an iret is executed. (NMI is disabled until iret is executed). Compare to this to what the master CPU does: master CPU just has to unlock all slave locks and then immediately set debugger_active to 0. (The only exeception to this is when debugger_step is set. More about this below). The later can be executed much quicker than the former and while in theory the former can execute before the later, it can't happen in a real-life situation. There is a delay of mdelay(2), when debugger_step is set and master debugger lets other CPUs run (kgdb_contthread == 0), which could potentially trigger this race. Could you please confirm whether any of the following two solve your problem? 1. Comment out these lines from kgdb_handle_exception + if (debugger_step) + mdelay(2); 2. Execute this command from gdb as soon as it is started "set scheduler-locking on" > then Slave CPU can reenter > kgdb_wait(). As (procindebug[atomic_read(&debugger_active) - 1) is > zero(Master CPU have set procindebug[MasterCPU] to zero before exit), Slave > loops in kgdb_wait(): ... > /* Wait till master processor goes completely into the debugger. > */ > while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])) { > int i = 10; /* an arbitrary number */ > > while (--i) > cpu_relax(); > } > ... > Slave CPU loops until Master CPU completely exits KGDB and set > debugger_active to zero. But when debugger_active became zero, Slave CPU > don't leaves loop, instead it hangs in while loop, because it starts to > check procindebug[-1], because atomic_read(&debugger_active) = 0: ... > while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])){...} > ... > For me procindebug[-1] is always zero, so Slave CPU hangs in NMI handler > and stops accept NMIs. It leads to whole system hang. > > How Solved: > New atomic variable debugger_exiting was added. It's set when Master CPU > starts waiting Slave CPUs, and is reset after debugger_active is set to > zero. Variable debugger_exiting is checked in kgdb_notify() and > kgdb_nmihook wouldn't be called until debugger_exiting equal zero. So > debugger_exiting guaranties that Slave CPU won't reenter kgdb_wait() until > Master CPU completely leaves KGDB. Patch against kernel 2.6.24.3. I would strongly recommend adding any more locking variables. As it is we've sufficient difficulty analyzing races :-). -Amit > > Signed-off-by: Konstantin Baydarov <[EMAIL PROTECTED]> > > arch/x86/kernel/kgdb_32.c | 9 ++++++--- > arch/x86/kernel/kgdb_64.c | 9 ++++++--- > include/linux/kgdb.h | 1 + > kernel/kgdb.c | 4 ++++ > 4 files changed, 17 insertions(+), 6 deletions(-) > > Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c > =================================================================== > --- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_32.c > +++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c > @@ -326,14 +326,16 @@ static int kgdb_notify(struct notifier_b > > switch (cmd) { > case DIE_NMI: > - if (atomic_read(&debugger_active)) { > + if (atomic_read(&debugger_active) && > + !atomic_read(&debugger_exiting)) { > /* KGDB CPU roundup */ > kgdb_nmihook(raw_smp_processor_id(), regs); > return NOTIFY_STOP; > } > return NOTIFY_DONE; > case DIE_NMI_IPI: > - if (atomic_read(&debugger_active)) { > + if (atomic_read(&debugger_active) && > + !atomic_read(&debugger_exiting)) { > /* KGDB CPU roundup */ > if (kgdb_nmihook(raw_smp_processor_id(), regs)) > return NOTIFY_DONE; > @@ -341,7 +343,8 @@ static int kgdb_notify(struct notifier_b > } > return NOTIFY_DONE; > case DIE_NMIWATCHDOG: > - if (atomic_read(&debugger_active)) { > + if (atomic_read(&debugger_active) && > + !atomic_read(&debugger_exiting)) { > /* KGDB CPU roundup */ > kgdb_nmihook(raw_smp_processor_id(), regs); > return NOTIFY_STOP; > Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c > =================================================================== > --- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_64.c > +++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c > @@ -406,14 +406,16 @@ static int kgdb_notify(struct notifier_b > > switch (cmd) { > case DIE_NMI: > - if (atomic_read(&debugger_active)) { > + if (atomic_read(&debugger_active) && > + !atomic_read(&debugger_exiting)) { > /* KGDB CPU roundup */ > kgdb_nmihook(raw_smp_processor_id(), regs); > return NOTIFY_STOP; > } > return NOTIFY_DONE; > case DIE_NMI_IPI: > - if (atomic_read(&debugger_active)) { > + if (atomic_read(&debugger_active) && > + !atomic_read(&debugger_exiting)) { > /* KGDB CPU roundup */ > if (kgdb_nmihook(raw_smp_processor_id(), regs)) > return NOTIFY_DONE; > @@ -421,7 +423,8 @@ static int kgdb_notify(struct notifier_b > } > return NOTIFY_DONE; > case DIE_NMIWATCHDOG: > - if (atomic_read(&debugger_active)) { > + if (atomic_read(&debugger_active) && > + !atomic_read(&debugger_exiting)) { > /* KGDB CPU roundup */ > kgdb_nmihook(raw_smp_processor_id(), regs); > return NOTIFY_STOP; > Index: ko_2_6_24_3_kgdb/include/linux/kgdb.h > =================================================================== > --- ko_2_6_24_3_kgdb.orig/include/linux/kgdb.h > +++ ko_2_6_24_3_kgdb/include/linux/kgdb.h > @@ -281,6 +281,7 @@ extern int kgdb_handle_exception(int ex_ > extern int kgdb_nmihook(int cpu, void *regs); > extern int debugger_step; > extern atomic_t debugger_active; > +extern atomic_t debugger_exiting; > #else > /* Stubs for when KGDB is not set. */ > static const atomic_t debugger_active = ATOMIC_INIT(0); > Index: ko_2_6_24_3_kgdb/kernel/kgdb.c > =================================================================== > --- ko_2_6_24_3_kgdb.orig/kernel/kgdb.c > +++ ko_2_6_24_3_kgdb/kernel/kgdb.c > @@ -117,6 +117,8 @@ int debugger_step; > static atomic_t kgdb_sync = ATOMIC_INIT(-1); > atomic_t debugger_active; > EXPORT_SYMBOL(debugger_active); > +atomic_t debugger_exiting = ATOMIC_INIT(0); > +EXPORT_SYMBOL(debugger_exiting); > > /* Our I/O buffers. */ > static char remcom_in_buffer[BUFMAX]; > @@ -1526,6 +1528,7 @@ default_handle: > atomic_set(&procindebug[processor], 0); > > if (!debugger_step || !kgdb_contthread) { > + atomic_set(&debugger_exiting, 1); > for (i = 0; i < NR_CPUS; i++) > spin_unlock(&slavecpulocks[i]); > /* Wait till all the processors have quit > @@ -1557,6 +1560,7 @@ default_handle: > kgdb_restore: > /* Free debugger_active */ > atomic_set(&debugger_active, 0); > + atomic_set(&debugger_exiting, 0); > atomic_set(&kgdb_sync, -1); > clocksource_touch_watchdog(); > kgdb_softlock_skip[processor] = 1; > > ------------------------------------------------------------------------- > This SF.net email is sponsored by: Microsoft > Defy all challenges. Microsoft(R) Visual Studio 2008. > http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ > _______________________________________________ > Kgdb-bugreport mailing list > Kgdb-bugreport@lists.sourceforge.net > https://lists.sourceforge.net/lists/listinfo/kgdb-bugreport ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2008. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ _______________________________________________ Kgdb-bugreport mailing list Kgdb-bugreport@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kgdb-bugreport