Hi Philippe,

debugging is nice, tracing is still nicer. As you suggested, I extended
the tracer with per-domain stall flags (needs some output clean-up,
preliminary patch attached nevertheless).

And here is the result (full trace attached):

> :|    (0x51) 0x000000c8 -1113+   1.112  __ipipe_sync_stage+0x142 
> (ipipe_suspend_domain+0x56)
> :|    fn                -1112+   1.789  __ipipe_sync_stage+0xe 
> (ipipe_suspend_domain+0x56)
> :|   *(0x50) 0x00000064 -1110+   1.293  __ipipe_sync_stage+0x97 
> (ipipe_suspend_domain+0x56)
> :    *fn                -1109+   1.398  do_IRQ+0x8 (__ipipe_sync_stage+0xcf)
> :    *fn                -1107+   2.105  __do_IRQ+0xc (do_IRQ+0x21)
> :    *fn                -1105+   1.631  handle_IRQ_event+0xd (__do_IRQ+0x9e)
> :    *fn                -1104+   2.413  timer_interrupt+0x9 
> (handle_IRQ_event+0x40)
> :    *fn                -1101+   3.022  mark_offset_tsc+0xe 
> (timer_interrupt+0x31)
> :    *fn                -1098+   2.721  do_timer+0x9 (timer_interrupt+0x37)
> :|   *fn                -1096+   2.112  __ipipe_handle_irq+0xe 
> (common_interrupt+0x18)
> :|   *fn                -1093+   1.210  __ipipe_ack_common_irq+0xc 
> (__ipipe_handle_irq+0xc0)
> :|   *(0x50) 0x00000064 -1092+   1.218  __ipipe_ack_common_irq+0x31 
> (__ipipe_handle_irq+0xc0)
> :|   *fn                -1091+   1.834  mask_and_ack_8259A+0xb 
> (__ipipe_ack_common_irq+0x5d)
> :|   *(0x50) 0x00000064 -1089+   1.345  __ipipe_ack_common_irq+0x9d 
> (__ipipe_handle_irq+0xc0)
> :|   *fn                -1088    0.924  ipipe_suspend_domain+0xb 
> (__ipipe_handle_irq+0x174)
> :|   *(0x51) 0x000000c8 -1087+   1.172  ipipe_suspend_domain+0x26 
> (__ipipe_handle_irq+0x174)
> :|   *fn                -1086+   2.030  __ipipe_sync_stage+0xe 
> (ipipe_suspend_domain+0x56)
> :|  **(0x50) 0x000000c8 -1084+   1.330  __ipipe_sync_stage+0x97 
> (ipipe_suspend_domain+0x56)
> :|  **fn                -1082    0.879  xnintr_clock_handler+0x8 
> (__ipipe_sync_stage+0x12b)
> :|  **fn                -1082+   1.218  xnintr_irq_handler+0xb 
> (xnintr_clock_handler+0x18)
> :|  **fn                -1080+   1.233  xnpod_announce_tick+0x9 
> (xnintr_irq_handler+0x2a)
> :|  **(0x50) 0x000000c8 -1079+   1.736  xnpod_announce_tick+0x20 
> (xnintr_irq_handler+0x2a)
> :|  **fn                -1077+   2.984  xntimer_do_tick_aperiodic+0xe 
> (xnpod_announce_tick+0x36)
> :|  **fn                -1074+   2.751  xnthread_timeout_handler+0x8 
> (xntimer_do_tick_aperiodic+0x18d)
> :|  **fn                -1072+   1.864  xnpod_resume_thread+0xe 
> (xnthread_timeout_handler+0x1d)
> :|  **(0x50) 0x000000c8 -1070+   4.699  xnpod_resume_thread+0x2b 
> (xnthread_timeout_handler+0x1d)
> :|  **(0x50) 0x000000c8 -1065+   1.586  xnpod_resume_thread+0x2bf 
> (xnthread_timeout_handler+0x1d)
> :|  **(0x01) 0x00001237 -1063+   2.661  xntimer_do_tick_aperiodic+0x309 
> (xnpod_announce_tick+0x36)
> :|  **(0x50) 0x000000c8 -1061+   1.263  xnpod_announce_tick+0x4f 
> (xnintr_irq_handler+0x2a)
> :|  **fn                -1060+   1.255  rthal_irq_end+0x8 
> (xnintr_irq_handler+0x46)
> :|  **fn                -1058+   2.007  enable_8259A_irq+0x9 
> (rthal_irq_end+0x2e)
> :|  **fn                -1056+   1.924  xnpod_schedule+0xe 
> (xnintr_irq_handler+0x6e)
> :|  **(0x50) 0x000000c8 -1054!  15.368  xnpod_schedule+0x53 
> (xnintr_irq_handler+0x6e)
> :|  **fn                -1039!  13.300  __switch_to+0xc (xnpod_schedule+0x689)
> :|  **(0x50) 0x000000c8 -1026+   1.766  xnpod_schedule+0x951 
> (xnpod_suspend_thread+0x27c)
> :|  **(0x50) 0x000000c8 -1024!  18.195  xnpod_suspend_thread+0x2bb 
> (rt_task_sleep+0x54)
> :   **fn                -1006+   1.624  __ipipe_syscall_root+0x9 
> (system_call+0x20)
> :   **fn                -1004+   1.406  __ipipe_dispatch_event+0xe 
> (__ipipe_syscall_root+0x58)
> :   **fn                -1003+   4.323  hisyscall_event+0xe 
> (__ipipe_dispatch_event+0x5e)
> :   **fn                 -998+   1.398  __rt_task_sleep+0xa 
> (hisyscall_event+0x23c)
> :   **fn                 -997+   1.789  __copy_from_user_ll+0xa 
> (__rt_task_sleep+0x1a)
> :   **fn                 -995!  15.345  rt_task_sleep+0xa 
> (__rt_task_sleep+0x25)
> :   **fn                 -980    0.879  __ipipe_syscall_root+0x9 
> (system_call+0x20)
> :   **fn                 -979+   1.308  __ipipe_dispatch_event+0xe 
> (__ipipe_syscall_root+0x58)
> :   **fn                 -978+   2.451  hisyscall_event+0xe 
> (__ipipe_dispatch_event+0x5e)
> :   **fn                 -975+   1.631  sys_rtdm_ioctl+0x8 
> (hisyscall_event+0x23c)
> :   **fn                 -974+   1.255  _rtdm_ioctl+0xc (sys_rtdm_ioctl+0x1b)
> :   **fn                 -972+   4.180  rtdm_context_get+0xa 
> (_rtdm_ioctl+0x20)
> :   **fn                 -968+   1.203  __ipipe_syscall_root+0x9 
> (system_call+0x20)
> :   **fn                 -967+   1.345  __ipipe_dispatch_event+0xe 
> (__ipipe_syscall_root+0x58)

The '*' mark a stalled domain, root domain on the right. It seems to me
that xnpod_suspend_thread() leaves the xeno domain stalled on wake up.
This gets recovered somehow later.

But here we see a different effect which finally causes the tick to get
frozen:

> :     fn                 -504+   2.075  sched_clock+0xd (schedule+0x112)
> :     fn                 -502    0.992  __ipipe_stall_root+0x8 
> (schedule+0x18e)
> :    *(0x50) 0x00000064  -501+   4.022  __ipipe_stall_root+0x32 
> (schedule+0x18e)
> :    *fn                 -497+   1.977  __ipipe_dispatch_event+0xe 
> (schedule+0x412)
> :    *fn                 -495+   4.210  schedule_event+0xd 
> (__ipipe_dispatch_event+0x5e)
> :   **(0x50) 0x000000c8  -491+   1.428  schedule_event+0x261 
> (__ipipe_dispatch_event+0x5e)
> :|  **fn                 -490+   2.067  xnpod_schedule_runnable+0xe 
> (schedule_event+0x28b)
> :|  **fn                 -488    0.954  ipipe_unstall_pipeline_from+0xc 
> (schedule_event+0x2c1)
> :|   *(0x51) 0x000000c8  -487+   4.646  ipipe_unstall_pipeline_from+0x25 
> (schedule_event+0x2c1)
> :|   *fn                 -482+   7.248  __switch_to+0xc (xnpod_schedule+0x689)
> :|  **(0x50) 0x000000c8  -475+   1.421  xnpod_schedule+0x77f 
> (xnpod_suspend_thread+0x27c)
> :|  **(0x50) 0x000000c8  -473+   1.481  xnpod_schedule+0x951 
> (xnpod_suspend_thread+0x27c)
> :|  **(0x50) 0x000000c8  -472+   1.654  xnpod_suspend_thread+0x2bb 
> (xnshadow_relax+0x136)
> :|  **(0x50) 0x000000c8  -470+   2.917  xnshadow_relax+0x154 
> (hisyscall_event+0x2ec)
> :|  **fn                 -467+   1.375  ipipe_reenter_root+0xb 
> (xnshadow_relax+0x204)
> :|  **fn                 -466    0.954  __ipipe_unstall_root+0x8 
> (ipipe_reenter_root+0x26)
> :|  * (0x51) 0x00000064  -465+   3.789  __ipipe_unstall_root+0x25 
> (ipipe_reenter_root+0x26)
> :   * fn                 -461+   1.578  send_sig+0x8 (xnshadow_relax+0x228)
> :   * fn                 -460+   1.496  send_sig_info+0xb (send_sig+0x1d)
> :   * fn                 -458    0.909  __ipipe_test_and_stall_root+0x9 
> (send_sig_info+0x38)
> :   **(0x50) 0x00000064  -457+   1.699  __ipipe_test_and_stall_root+0x27 
> (send_sig_info+0x38)
> :   **fn                 -455+   1.203  specific_send_sig_info+0xb 
> (send_sig_info+0x69)
> :   **fn                 -454+   1.706  __ipipe_test_root+0x8 
> (specific_send_sig_info+0x16)
> :   **fn                 -453+   3.360  sig_ignored+0x9 
> (specific_send_sig_info+0x29)
> :   **fn                 -449+   1.714  send_signal+0xb 
> (specific_send_sig_info+0x55)
> :   **fn                 -447+   3.142  __sigqueue_alloc+0x9 
> (send_signal+0x3f)
> :   **fn                 -444+   1.210  kmem_cache_alloc+0xa 
> (__sigqueue_alloc+0x42)
> :   **fn                 -443    0.909  __ipipe_test_and_stall_root+0x9 
> (kmem_cache_alloc+0x12)
> :   **(0x50) 0x00000064  -442+   2.180  __ipipe_test_and_stall_root+0x27 
> (kmem_cache_alloc+0x12)
> :   **fn                 -440+   1.218  __ipipe_restore_root+0x8 
> (kmem_cache_alloc+0x52)
> :   **fn                 -439+   1.315  __ipipe_stall_root+0x8 
> (__ipipe_restore_root+0x11)

No more recovery from the xeno stall, no more timer ticks.

The special traces were generated via

#define ipipe_mark_domain_stall(ipd,cpuid)     \
        ipipe_trace_special(0x50, ipd->priority)
#define ipipe_mark_domain_unstall(ipd,cpuid)   \
        ipipe_trace_special(0x51, ipd->priority)

Any ideas? I do not find anything fishy yet that we may have introduced
to cause this problem.

Jan
Index: linux-2.6.15.5/kernel/ipipe/core.c
===================================================================
--- linux-2.6.15.5.orig/kernel/ipipe/core.c     2006-04-08 13:36:34.000000000 
+0200
+++ linux-2.6.15.5/kernel/ipipe/core.c  2006-04-08 16:39:30.000000000 +0200
@@ -40,7 +40,7 @@
 
 ipipe_spinlock_t __ipipe_pipelock = IPIPE_SPIN_LOCK_UNLOCKED;
 
-struct list_head __ipipe_pipeline;
+LIST_HEAD(__ipipe_pipeline);
 
 unsigned long __ipipe_virtual_irq_map = 0;
 
@@ -66,8 +66,6 @@
         * secondary CPUs are still lost in space.
         */
 
-       INIT_LIST_HEAD(&__ipipe_pipeline);
-
        ipd->name = "Linux";
        ipd->domid = IPIPE_ROOT_ID;
        ipd->priority = IPIPE_ROOT_PRIO;
Index: linux-2.6.15.5/kernel/ipipe/tracer.c
===================================================================
--- linux-2.6.15.5.orig/kernel/ipipe/tracer.c   2006-04-08 13:37:21.000000000 
+0200
+++ linux-2.6.15.5/kernel/ipipe/tracer.c        2006-04-08 16:52:07.000000000 
+0200
@@ -51,6 +51,7 @@
 
 #define IPIPE_TFLG_HWIRQ_OFF        0x0100
 #define IPIPE_TFLG_FREEZING         0x0200
+#define IPIPE_TFLG_DOMSTATE_SHIFT   12   /* bits 12..15: domain stalled? */
 
 
 struct ipipe_trace_point{
@@ -118,6 +119,24 @@
 static void __ipipe_print_symname(struct seq_file *m, unsigned long eip);
 
 
+static notrace void
+__ipipe_store_domain_states(struct ipipe_trace_point *point, int cpu_id)
+{
+       int i = IPIPE_TFLG_DOMSTATE_SHIFT;
+       struct list_head *pos;
+
+       list_for_each_prev(pos, &__ipipe_pipeline) {
+               struct ipipe_domain *ipd =
+                       list_entry(pos, struct ipipe_domain, p_link);
+
+               if (test_bit(IPIPE_STALL_FLAG, &ipd->cpudata[cpu_id].status))
+                       point->flags |= (1 << i);
+
+               if (++i > IPIPE_TFLG_DOMSTATE_SHIFT+3)
+                       break;
+       }
+}
+
 static notrace int __ipipe_get_free_trace_path(int old, int cpu_id)
 {
        int new_active = old;
@@ -282,6 +301,8 @@
        point->v = v;
        ipipe_read_tsc(point->timestamp);
 
+       __ipipe_store_domain_states(point, cpu_id);
+
        /* forward to next point buffer */
        next_pos = WRAP_POINT_NO(pos+1);
        tp->trace_pos = next_pos;
@@ -617,6 +638,7 @@
 {
        char mark = ' ';
        int point_no = point - print_path->point;
+       int i;
 
        if (print_path->end == point_no)
                mark = '<';
@@ -626,6 +648,12 @@
                mark = ':';
        seq_printf(m, "%c%c", mark,
                   (point->flags & IPIPE_TFLG_HWIRQ_OFF) ? '|' : ' ');
+
+       if (verbose_trace)
+               for (i = IPIPE_TFLG_DOMSTATE_SHIFT + 3;
+                    i >= IPIPE_TFLG_DOMSTATE_SHIFT; i--)
+                       seq_printf(m, "%c",
+                                  (point->flags & (1 << i)) ? '*' : ' ');
 }
 
 static void

Attachment: frozen-timer3.bz2
Description: Binary data

Attachment: signature.asc
Description: OpenPGP digital signature

_______________________________________________
Xenomai-core mailing list
Xenomai-core@gna.org
https://mail.gna.org/listinfo/xenomai-core

Reply via email to