On Monday 17 March 2008 11:55:28 pm Konstantin Baydarov wrote:
> Problem:
> Sometimes(after remote gdb was connected) x86 SMP kernel(with KGDB and NMI
> watchdog enabled) hangs when kernel modules are automatically loaded.

Konstantin,

The description below doesn't mention how module loading comes into picture.

>
> Root Cause:
>   Slave CPU hangs in kgdb_wait() when master CPU leaves KGDB, causing the
> whole system to hang.
>   If watchdog NMI occurs when Slave CPU have already exited kgdb_wait() and
> Master CPU haven't unset debugger_active, 

An NMI watchdog can't occur until kgdb_wait function returns, control goes to 
kgdb_nmihook, which returns control to kgdb_notify, which in turn returns 
through the notify chain call returns, do_nmi, and then to entry.S, where an 
iret is executed. (NMI is disabled until iret is executed).

Compare to this to what the master CPU does: master CPU just has to unlock all 
slave locks and then immediately set debugger_active to 0. (The only 
exeception to this is when debugger_step is set. More about this below).

The later can be executed much quicker than the former and while in theory the 
former can execute before the later, it can't happen in a real-life 
situation.

There is a delay of mdelay(2), when debugger_step is set and master debugger 
lets other CPUs run (kgdb_contthread == 0), which could potentially trigger 
this race. Could you please confirm whether any of the following two solve 
your problem?

1. Comment out these lines from kgdb_handle_exception
+       if (debugger_step)
+               mdelay(2);


2. Execute this command from gdb as soon as it is started "set 
scheduler-locking on"


> then Slave CPU can reenter 
> kgdb_wait(). As (procindebug[atomic_read(&debugger_active) - 1) is
> zero(Master CPU have set procindebug[MasterCPU] to zero before exit), Slave
> loops in kgdb_wait(): ...
>       /* Wait till master processor goes completely into the debugger.
>        */
>       while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])) {
>               int i = 10;     /* an arbitrary number */
>
>               while (--i)
>                       cpu_relax();
>       }
> ...
> Slave CPU loops until Master CPU completely exits KGDB and set
> debugger_active to zero. But when debugger_active became zero, Slave CPU
> don't leaves loop, instead it hangs in while loop, because it starts to
> check procindebug[-1], because atomic_read(&debugger_active) = 0: ...
> while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])){...}
> ...
> For me procindebug[-1] is always zero, so Slave CPU hangs in NMI handler
> and stops accept NMIs. It leads to whole system hang.
>
> How Solved:
>   New atomic variable debugger_exiting was added. It's set when Master CPU
> starts waiting Slave CPUs, and is reset after debugger_active is set to
> zero. Variable debugger_exiting is checked in kgdb_notify() and
> kgdb_nmihook wouldn't be called until debugger_exiting equal zero. So
> debugger_exiting guaranties that Slave CPU won't reenter kgdb_wait() until
> Master CPU completely leaves KGDB. Patch against kernel 2.6.24.3.

I would strongly recommend adding any more locking variables. As it is we've 
sufficient difficulty analyzing races :-).
-Amit

>
> Signed-off-by: Konstantin Baydarov <[EMAIL PROTECTED]>
>
>  arch/x86/kernel/kgdb_32.c |    9 ++++++---
>  arch/x86/kernel/kgdb_64.c |    9 ++++++---
>  include/linux/kgdb.h      |    1 +
>  kernel/kgdb.c             |    4 ++++
>  4 files changed, 17 insertions(+), 6 deletions(-)
>
> Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_32.c
> +++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c
> @@ -326,14 +326,16 @@ static int kgdb_notify(struct notifier_b
>
>       switch (cmd) {
>       case DIE_NMI:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       kgdb_nmihook(raw_smp_processor_id(), regs);
>                       return NOTIFY_STOP;
>               }
>               return NOTIFY_DONE;
>       case DIE_NMI_IPI:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       if (kgdb_nmihook(raw_smp_processor_id(), regs))
>                               return NOTIFY_DONE;
> @@ -341,7 +343,8 @@ static int kgdb_notify(struct notifier_b
>               }
>               return NOTIFY_DONE;
>       case DIE_NMIWATCHDOG:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       kgdb_nmihook(raw_smp_processor_id(), regs);
>                       return NOTIFY_STOP;
> Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_64.c
> +++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c
> @@ -406,14 +406,16 @@ static int kgdb_notify(struct notifier_b
>
>       switch (cmd) {
>       case DIE_NMI:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       kgdb_nmihook(raw_smp_processor_id(), regs);
>                       return NOTIFY_STOP;
>               }
>               return NOTIFY_DONE;
>       case DIE_NMI_IPI:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       if (kgdb_nmihook(raw_smp_processor_id(), regs))
>                               return NOTIFY_DONE;
> @@ -421,7 +423,8 @@ static int kgdb_notify(struct notifier_b
>               }
>               return NOTIFY_DONE;
>       case DIE_NMIWATCHDOG:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       kgdb_nmihook(raw_smp_processor_id(), regs);
>                       return NOTIFY_STOP;
> Index: ko_2_6_24_3_kgdb/include/linux/kgdb.h
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/include/linux/kgdb.h
> +++ ko_2_6_24_3_kgdb/include/linux/kgdb.h
> @@ -281,6 +281,7 @@ extern int kgdb_handle_exception(int ex_
>  extern int kgdb_nmihook(int cpu, void *regs);
>  extern int debugger_step;
>  extern atomic_t debugger_active;
> +extern atomic_t debugger_exiting;
>  #else
>  /* Stubs for when KGDB is not set. */
>  static const atomic_t debugger_active = ATOMIC_INIT(0);
> Index: ko_2_6_24_3_kgdb/kernel/kgdb.c
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/kernel/kgdb.c
> +++ ko_2_6_24_3_kgdb/kernel/kgdb.c
> @@ -117,6 +117,8 @@ int debugger_step;
>  static atomic_t kgdb_sync = ATOMIC_INIT(-1);
>  atomic_t debugger_active;
>  EXPORT_SYMBOL(debugger_active);
> +atomic_t debugger_exiting = ATOMIC_INIT(0);
> +EXPORT_SYMBOL(debugger_exiting);
>
>  /* Our I/O buffers. */
>  static char remcom_in_buffer[BUFMAX];
> @@ -1526,6 +1528,7 @@ default_handle:
>       atomic_set(&procindebug[processor], 0);
>
>       if (!debugger_step || !kgdb_contthread) {
> +             atomic_set(&debugger_exiting, 1);
>               for (i = 0; i < NR_CPUS; i++)
>                       spin_unlock(&slavecpulocks[i]);
>               /* Wait till all the processors have quit
> @@ -1557,6 +1560,7 @@ default_handle:
>   kgdb_restore:
>       /* Free debugger_active */
>       atomic_set(&debugger_active, 0);
> +     atomic_set(&debugger_exiting, 0);
>       atomic_set(&kgdb_sync, -1);
>       clocksource_touch_watchdog();
>       kgdb_softlock_skip[processor] = 1;
>
> -------------------------------------------------------------------------
> This SF.net email is sponsored by: Microsoft
> Defy all challenges. Microsoft(R) Visual Studio 2008.
> http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
> _______________________________________________
> Kgdb-bugreport mailing list
> Kgdb-bugreport@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/kgdb-bugreport



-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Kgdb-bugreport mailing list
Kgdb-bugreport@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kgdb-bugreport

Reply via email to