Matthew Dillon wrote:
:I've managed to get gdb attached and get some information.
:
:The kernel is getting caught in a while loop in lwkt_acquire. I can :reliably trigger this with with a "make -j8 buildworld" under a SMP :kernel (Otherwise identical to GENERIC, no optimisations.) The OS is :completely unresponsive and all four cpu cores are running at 100%.
:
:I've included the debug information.
:
:Program received signal SIGINT, Interrupt.
:lwkt_acquire (td=0xc6a59e70) at /usr/src/sys/kern/lwkt_thread.c:1048
:1048           while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK))
:(gdb) l
:1043       mygd = mycpu;
:1044       if (gd != mycpu) {
:1045           cpu_lfence();
:1046           KKASSERT((td->td_flags & TDF_RUNQ) == 0);
:1047           crit_enter_gd(mygd);
:1048           while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK))
:1049               cpu_lfence();
:1050           td->td_gd = mygd;
:1051           TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
:1052           td->td_flags &= ~TDF_MIGRATING;
:(gdb) p td->td_flags
:$1 = 8390177
:(gdb) p td
:$2 = (thread_t) 0xc6a59e70
:(gdb) bt
:#0  lwkt_acquire (td=0xc6a59e70) at /usr/src/sys/kern/lwkt_thread.c:1048
:#1 0xc02c66af in bsd4_select_curproc (gd=0xff800000) at :/usr/src/sys/kern/usched_bsd4.c:358 :#2 0xc02c6829 in bsd4_release_curproc (lp=0xea634c00) at :/usr/src/sys/kern/usched_bsd4.c:322 :#3 0xc04b8239 in passive_release (td=0xdfe8aba0) at :..
:
:Continuing execution does not appear to cause any problems.
:I can provide additional debugging info if required but I'm unsure of :how to proceed with this myself.
:
:Regards
:
:Gary

    This is great info.  One thing, what do you mean by 'KVM'?  What
    is DragonFly running under, exactly?

    I think once I understand the environment I may be able to figure out
    why the acquisition loop is blowing up.

                                        -Matt
Matthew Dillon <[EMAIL PROTECTED]>


I've triggered another lock up and this time got a different trace. Execution appears to be looping indefinitely inside LWKT code.

Debugging gives the output below. Again all four core are running at 100%.


Program received signal SIGINT, Interrupt.
lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67a7000, frame=0x0)
    at /usr/src/sys/kern/lwkt_ipiq.c:522
522         while (wi - (ri = ip->ip_rindex) > 0) {
(gdb) l
517          *
518          * Note: due to additional IPI operations that the callback function
519 * may make, it is possible for both rindex and windex to advance and
520          * thus for rindex to advance passed our cached windex.
521          */
522         while (wi - (ri = ip->ip_rindex) > 0) {
523             ri &= MAXCPUFIFO_MASK;
524             copy_func = ip->ip_func[ri];
525             copy_arg1 = ip->ip_arg1[ri];
526             copy_arg2 = ip->ip_arg2[ri];
(gdb) p wi
$5 = 356278
(gdb) p ip->ip_rindex
$6 = 356278
(gdb) bt
#0  lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67a7000,
    frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:522
#1  0xc02c94ad in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#2  0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
    func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
    at /usr/src/sys/kern/lwkt_ipiq.c:185
#3  0xc02c863c in lwkt_schedule (td=0xc0600170)
    at /usr/src/sys/sys/thread2.h:244
#4  0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:551
#5  0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:271
#6  0xc04b9603 in syscall2 (frame=0xe5a02d40)
    at /usr/src/sys/platform/pc32/i386/trap.c:349
#7  0xc04a3396 in Xint0x80_syscall ()
    at /usr/src/sys/platform/pc32/i386/exception.s:876
#8  0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.


Program received signal SIGINT, Interrupt.
lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67ac000, frame=0x0)
    at /usr/src/sys/kern/lwkt_ipiq.c:558
558         return(wi != ip->ip_windex);
(gdb) l
553          * ipiq.  ip_npoll is left set as long as possible to reduce the
554          * number of IPIs queued by the originating cpu, but must be cleared
555          * *BEFORE* checking windex.
556          */
557         atomic_poll_release_int(&ip->ip_npoll);
558         return(wi != ip->ip_windex);
559     }
560     
561     static void
562     lwkt_sync_ipiq(void *arg)
(gdb) p wi
$7 = 357733
(gdb) p ip->ip_windex
$8 = 357733
(gdb) bt
#0  lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67ac000,
    frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:558
#1  0xc02c94ad in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#2  0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
    func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
    at /usr/src/sys/kern/lwkt_ipiq.c:185
#3  0xc02c863c in lwkt_schedule (td=0xc0600170)
    at /usr/src/sys/sys/thread2.h:244
#4  0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:551
#5  0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:271
#6  0xc04b9603 in syscall2 (frame=0xe5a02d40)
    at /usr/src/sys/platform/pc32/i386/trap.c:349
#7  0xc04a3396 in Xint0x80_syscall ()
    at /usr/src/sys/platform/pc32/i386/exception.s:876
#8  0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.


Program received signal SIGINT, Interrupt.
lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67b1000, frame=0x0)
    at /usr/src/sys/kern/lwkt_ipiq.c:558
558         return(wi != ip->ip_windex);
(gdb) p wi
$9 = 372884
(gdb) p  ip->ip_windex
$10 = 372884
(gdb) bt
#0  lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67b1000,
    frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:558
#1  0xc02c94ad in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#2  0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
    func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
    at /usr/src/sys/kern/lwkt_ipiq.c:185
#3  0xc02c863c in lwkt_schedule (td=0xc0600170)
    at /usr/src/sys/sys/thread2.h:244
#4  0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:551
#5  0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:271
#6  0xc04b9603 in syscall2 (frame=0xe5a02d40)
    at /usr/src/sys/platform/pc32/i386/trap.c:349
#7  0xc04a3396 in Xint0x80_syscall ()
    at /usr/src/sys/platform/pc32/i386/exception.s:876
#8  0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.


Program received signal SIGINT, Interrupt.
0xc02c93a5 in lwkt_process_ipiq_core (sgd=<value optimized out>,
    ip=0xff8001e8, frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:522
522         while (wi - (ri = ip->ip_rindex) > 0) {
(gdb) p wi
$11 = 1343383299
(gdb) p ip->ip_rindex
$12 = 1343383298
(gdb) bt
#0  0xc02c93a5 in lwkt_process_ipiq_core (sgd=<value optimized out>,
    ip=0xff8001e8, frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:522
#1  0xc02c94df in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:458
#2  0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
    func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
    at /usr/src/sys/kern/lwkt_ipiq.c:185
#3  0xc02c863c in lwkt_schedule (td=0xc0600170)
    at /usr/src/sys/sys/thread2.h:244
#4  0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:551
#5  0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:271
#6  0xc04b9603 in syscall2 (frame=0xe5a02d40)
    at /usr/src/sys/platform/pc32/i386/trap.c:349
#7  0xc04a3396 in Xint0x80_syscall ()
    at /usr/src/sys/platform/pc32/i386/exception.s:876
#8  0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.


Program received signal SIGINT, Interrupt.
0xc02c9495 in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
452                     while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], 
NULL))
(gdb) l
447         for (n = 0; n < ncpus; ++n) {
448             if (n != gd->gd_cpuid) {
449                 sgd = globaldata_find(n);
450                 ip = sgd->gd_ipiq;
451                 if (ip != NULL) {
452                     while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], 
NULL))
453                         ;
454                 }
455             }
456         }
(gdb) bt
#0  0xc02c9495 in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#1 0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000, Cannot access memory at address 0x8
    func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
    at /usr/src/sys/kern/lwkt_ipiq.c:185
#2  0xc02c863c in lwkt_schedule (td=0xc0600170)
    at /usr/src/sys/sys/thread2.h:244
#3  0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:551
#4  0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:271
#5  0xc04b9603 in syscall2 (frame=0xe5a02d40)
    at /usr/src/sys/platform/pc32/i386/trap.c:349
#6  0xc04a3396 in Xint0x80_syscall ()
    at /usr/src/sys/platform/pc32/i386/exception.s:876
#7  0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.


Program received signal SIGINT, Interrupt.
0xc02c944c in lwkt_process_ipiq_core (sgd=<value optimized out>,
ip=<value optimized out>, frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:559
559     }
(gdb) l
554          * number of IPIs queued by the originating cpu, but must be cleared
555          * *BEFORE* checking windex.
556          */
557         atomic_poll_release_int(&ip->ip_npoll);
558         return(wi != ip->ip_windex);
559     }
560     
561     static void
562     lwkt_sync_ipiq(void *arg)
563     {
(gdb) p wi
$13 = 357733
(gdb) p ip->ip_windex
Cannot access memory at address 0x8
(gdb) bt
#0  0xc02c944c in lwkt_process_ipiq_core (sgd=<value optimized out>,
ip=<value optimized out>, frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:559
#1  0xc02c94ad in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#2  0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
    func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
    at /usr/src/sys/kern/lwkt_ipiq.c:185
#3  0xc02c863c in lwkt_schedule (td=0xc0600170)
    at /usr/src/sys/sys/thread2.h:244
#4  0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:551
#5  0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:271
#6  0xc04b9603 in syscall2 (frame=0xe5a02d40)
    at /usr/src/sys/platform/pc32/i386/trap.c:349
#7  0xc04a3396 in Xint0x80_syscall ()
    at /usr/src/sys/platform/pc32/i386/exception.s:876
#8  0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.


Program received signal SIGINT, Interrupt.
lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67ac000, frame=0x0)
    at /usr/src/sys/kern/lwkt_ipiq.c:558
558         return(wi != ip->ip_windex);
(gdb) l
553          * ipiq.  ip_npoll is left set as long as possible to reduce the
554          * number of IPIs queued by the originating cpu, but must be cleared
555          * *BEFORE* checking windex.
556          */
557         atomic_poll_release_int(&ip->ip_npoll);
558         return(wi != ip->ip_windex);
559     }
560     
561     static void
562     lwkt_sync_ipiq(void *arg)
(gdb) p wi
$24 = 357733
(gdb) p  ip->ip_windex
$25 = 357733
(gdb) bt
#0  lwkt_process_ipiq_core (sgd=<value optimized out>, ip=0xc67ac000,
    frame=0x0) at /usr/src/sys/kern/lwkt_ipiq.c:558
#1  0xc02c94ad in lwkt_process_ipiq () at /usr/src/sys/kern/lwkt_ipiq.c:452
#2  0xc02c9830 in lwkt_send_ipiq3 (target=0xff808000,
    func=0xc02c8519 <lwkt_schedule>, arg1=0xc0600170, arg2=0)
    at /usr/src/sys/kern/lwkt_ipiq.c:185
#3  0xc02c863c in lwkt_schedule (td=0xc0600170)
    at /usr/src/sys/sys/thread2.h:244
#4  0xc02c71e6 in bsd4_setrunqueue (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:551
#5  0xc02c72be in bsd4_acquire_curproc (lp=0xe5f8b400)
    at /usr/src/sys/kern/usched_bsd4.c:271
#6  0xc04b9603 in syscall2 (frame=0xe5a02d40)
    at /usr/src/sys/platform/pc32/i386/trap.c:349
#7  0xc04a3396 in Xint0x80_syscall ()
    at /usr/src/sys/platform/pc32/i386/exception.s:876
#8  0xe5a02d40 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
(gdb) c
Continuing.




Reply via email to