Hi all,

I'm not sure this is the correct solution, but I'd like to throw this
out in order to illustrate a couple of cases where KGDB could deadlock
when debugging under SMP.  I've seen this in i386 and x86_64 versions.

It so happens that when you go into the INT3 handler, the IPI_NMI we
send to the slave processors may arrive when one of those processors
holds the runqueue (struct rq) lock.  This could cause a deadlock.  I've
seen it happen in two places:

1)  While looping inside the kgdb_handle_exception(), your net polling
Ethernet driver (mine is e1000) tries to free SKB's using
dev_kfree_sdb_any().  If you don't bump your preempt to HARDIRQ, the
function would eventually deadlock trying to get the rq lock
(specifically in try_to_wake_up()).
2)  After INT3 ISR calls kgdb_handle_exception(), you hold the slave
CPUs expecting to jump back on the next instruction via the DEBUG ISR.
However, just as you leave the INT3 ISR, you get hit with the pending
smp_apic_timer_interrrupt().  smp_apic_timer_interrrupt() eventually
deadlocks trying to get the same rq lock.

I guess there may be more of these gotchas, but my patch places
protections against the cases I've seen.  Thanks for taking a look.

Tim Nguyen


diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index a60550f..8fd52c9 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -54,6 +54,7 @@ extern int pid_max;
 /* How many times to count all of the waiting CPUs */
 #define ROUNDUP_WAIT           640000  /* Arbitrary, increase if
needed. */
 #define BUF_THREAD_ID_SIZE     16
+#define NMIHOOK_TRY            100
 
 /*
  * kgdb_initialized with a value of 1 indicates that kgdb is setup and
is
@@ -94,6 +95,8 @@ struct kgdb_arch *kgdb_ops = &arch_kgdb_
 
 static const char hexchars[] = "0123456789abcdef";
 
+static int nmihook_abort = 0;
+static int nmihook_try[NR_CPUS] = { NMIHOOK_TRY };
 static spinlock_t slavecpulocks[NR_CPUS];
 static atomic_t procindebug[NR_CPUS];
 atomic_t kgdb_setting_breakpoint;
@@ -901,11 +904,14 @@ int kgdb_handle_exception(int ex_vector,
 
        kgdb_disable_hw_debug(linux_regs);
 
-       if (!debugger_step || !kgdb_contthread)
+       if (!debugger_step || !kgdb_contthread) {
+               add_preempt_count(HARDIRQ_OFFSET);
                for (i = 0; i < NR_CPUS; i++)
                        spin_lock(&slavecpulocks[i]);
+       }
 
 #ifdef CONFIG_SMP
+roundup_again:
        /* Make sure we get the other CPUs */
        if (!debugger_step || !kgdb_contthread)
                kgdb_roundup_cpus(flags);
@@ -928,7 +934,16 @@ int kgdb_handle_exception(int ex_vector,
                        all_cpus_synced = 1;
                        break;
                }
+               if (nmihook_abort) {
+                       udelay(1);
+                       nmihook_abort = 0;
+                       break;
+               }
        }
+#ifdef CONFIG_SMP
+       if (!all_cpus_synced)
+               goto roundup_again;
+#endif
 
        /* Clear the out buffer. */
        memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
@@ -1367,6 +1382,7 @@ int kgdb_handle_exception(int ex_vector,
                                        cpu_relax();
                        }
                }
+               sub_preempt_count(IRQ_EXIT_OFFSET);
        }
 
 #ifdef CONFIG_SMP
@@ -1409,8 +1425,21 @@ static struct notifier_block kgdb_module
 void kgdb_nmihook(int cpu, void *regs)
 {
 #ifdef CONFIG_SMP
-       if (!atomic_read(&procindebug[cpu]) &&
atomic_read(&debugger_active) != (cpu + 1))
+       if (!atomic_read(&procindebug[cpu]) &&
atomic_read(&debugger_active) != (cpu + 1)) {
+               if (preempt_count() - HARDIRQ_OFFSET && !need_resched()
&&
+                   nmihook_try[cpu]) {
+                       nmihook_try[cpu]--;
+                       nmihook_abort = 1;
+                       return;
+               }
+               if (!nmihook_try[cpu])
+                       printk("CPU#%d falling through to kgdb_wait "
+                              "preempt_count %08lx need_sched %d\n",
cpu,
+                              preempt_count() - HARDIRQ_OFFSET,
+                              need_resched());
+               nmihook_try[cpu] = NMIHOOK_TRY;
                kgdb_wait((struct pt_regs *)regs);
+       }
 #endif
 }
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 1d7dd62..4c3e7d2 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -34,6 +34,7 @@
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
 #include <linux/delay.h>
+#include <linux/kgdb.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1176,6 +1177,10 @@ void update_process_times(int user_tick)
        struct task_struct *p = current;
        int cpu = smp_processor_id();
 
+#ifdef CONFIG_KGDB
+       if (atomic_read(&cpu_doing_single_step) == cpu)
+               return;
+#endif
        /* Note: this timer irq context must be accounted for as well.
*/
        if (user_tick)
                account_user_time(p, jiffies_to_cputime(1));

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Kgdb-bugreport mailing list
Kgdb-bugreport@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kgdb-bugreport

Reply via email to