With the current AP bring up code Xen can get stuck indefinitely if an AP
freezes during boot after the 'callin' step.  Introduce a 10s timeout while
waiting for APs to finish startup.

On failure of an AP to complete startup send an NMI to trigger the printing
of a stack backtrace on the stuck AP and panic on the BSP.

The sending of the NMI re-uses the code already present in fatal_trap(), by
moving it to a separate function.

Signed-off-by: Roger Pau Monné <roger....@citrix.com>
---
 xen/arch/x86/include/asm/processor.h |  1 +
 xen/arch/x86/smpboot.c               |  8 ++++
 xen/arch/x86/traps.c                 | 66 +++++++++++++++++-----------
 3 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/xen/arch/x86/include/asm/processor.h 
b/xen/arch/x86/include/asm/processor.h
index eacd425c5350..10d8078cc1ca 100644
--- a/xen/arch/x86/include/asm/processor.h
+++ b/xen/arch/x86/include/asm/processor.h
@@ -371,6 +371,7 @@ void show_registers(const struct cpu_user_regs *regs);
 #define dump_execution_state() run_in_exception_handler(show_execution_state)
 void show_page_walk(unsigned long addr);
 void noreturn fatal_trap(const struct cpu_user_regs *regs, bool show_remote);
+void show_execution_state_nmi(const cpumask_t *mask, bool show_all);
 
 extern void mtrr_ap_init(void);
 extern void mtrr_bp_init(void);
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 48ce996ba414..77dce3e3e22b 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -1370,6 +1370,7 @@ int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t 
pxm)
 int __cpu_up(unsigned int cpu)
 {
     int apicid, ret;
+    s_time_t start;
 
     if ( (apicid = x86_cpu_to_apicid[cpu]) == BAD_APICID )
         return -ENODEV;
@@ -1388,10 +1389,17 @@ int __cpu_up(unsigned int cpu)
     time_latch_stamps();
 
     set_cpu_state(CPU_STATE_ONLINE);
+    start = NOW();
     while ( !cpu_online(cpu) )
     {
         cpu_relax();
         process_pending_softirqs();
+        if ( NOW() > start + SECONDS(10) )
+        {
+            /* AP is stuck, send NMI and panic. */
+            show_execution_state_nmi(cpumask_of(cpu), true);
+            panic("CPU%u: Stuck while starting up\n", cpu);
+        }
     }
 
     return 0;
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index c94779b4ad4f..9b9e3726e2fb 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -714,13 +714,15 @@ static cpumask_t show_state_mask;
 static bool opt_show_all;
 boolean_param("async-show-all", opt_show_all);
 
+static bool force_show_all;
+
 static int cf_check nmi_show_execution_state(
     const struct cpu_user_regs *regs, int cpu)
 {
     if ( !cpumask_test_cpu(cpu, &show_state_mask) )
         return 0;
 
-    if ( opt_show_all )
+    if ( opt_show_all || force_show_all )
         show_execution_state(regs);
     else if ( guest_mode(regs) )
         printk(XENLOG_ERR "CPU%d\t%pv\t%04x:%p in guest\n",
@@ -734,6 +736,40 @@ static int cf_check nmi_show_execution_state(
     return 1;
 }
 
+void show_execution_state_nmi(const cpumask_t *mask, bool show_all)
+{
+    unsigned int msecs, pending;
+
+    force_show_all = show_all;
+
+    watchdog_disable();
+    console_start_sync();
+
+    cpumask_copy(&show_state_mask, mask);
+    set_nmi_callback(nmi_show_execution_state);
+    /* Ensure new callback is set before sending out the NMI. */
+    smp_wmb();
+    send_IPI_mask(mask, APIC_DM_NMI);
+
+    /* Wait at most 10ms for some other CPU to respond. */
+    msecs = 10;
+    pending = cpumask_weight(&show_state_mask);
+    while ( pending && msecs-- )
+    {
+        unsigned int left;
+
+        mdelay(1);
+        left = cpumask_weight(&show_state_mask);
+        if ( left < pending )
+        {
+            pending = left;
+            msecs = 10;
+        }
+    }
+    if ( pending )
+        printk("Non-responding CPUs: {%*pbl}\n", CPUMASK_PR(&show_state_mask));
+}
+
 const char *vector_name(unsigned int vec)
 {
     static const char names[][4] = {
@@ -780,33 +816,11 @@ void fatal_trap(const struct cpu_user_regs *regs, bool 
show_remote)
 
         if ( show_remote )
         {
-            unsigned int msecs, pending;
+            cpumask_t *scratch = this_cpu(scratch_cpumask);
 
-            cpumask_andnot(&show_state_mask, &cpu_online_map,
+            cpumask_andnot(scratch, &cpu_online_map,
                            cpumask_of(smp_processor_id()));
-            set_nmi_callback(nmi_show_execution_state);
-            /* Ensure new callback is set before sending out the NMI. */
-            smp_wmb();
-            smp_send_nmi_allbutself();
-
-            /* Wait at most 10ms for some other CPU to respond. */
-            msecs = 10;
-            pending = cpumask_weight(&show_state_mask);
-            while ( pending && msecs-- )
-            {
-                unsigned int left;
-
-                mdelay(1);
-                left = cpumask_weight(&show_state_mask);
-                if ( left < pending )
-                {
-                    pending = left;
-                    msecs = 10;
-                }
-            }
-            if ( pending )
-                printk("Non-responding CPUs: {%*pbl}\n",
-                       CPUMASK_PR(&show_state_mask));
+            show_execution_state_nmi(scratch, false);
         }
     }
 
-- 
2.49.0


Reply via email to