As I was testing simple golang hello work app to see why it hangs (or does 
not terminate gracefully) as described by issue #850 I discovered number of 
threads waiting to be woken up from futex code calls. Here are the relevant 
thread stack traces (198, 199, 200 are the ones calling futex code):

(gdb) osv thread 12 (named ">init")
(gdb) bt
#0  0x00000000003e1aa6 in sched::thread::switch_to 
(this=0xffff800001417040, this@entry=0xbdbdd0) at arch/x64/arch-switch.hh:74
#1  0x00000000003e49c2 in sched::cpu::reschedule_from_interrupt 
(this=0xffff800000f2b040, called_from_yield=called_from_yield@entry=false, 
    preempt_after=..., preempt_after@entry=...) at core/sched.cc:339
#2  0x00000000003e4e7c in sched::cpu::schedule () at core/sched.cc:228
#3  0x00000000003e5542 in sched::thread::wait 
(this=this@entry=0xffff800001612040) at core/sched.cc:1214
#4  0x000000000036e7a8 in 
sched::thread::do_wait_for<sched::thread::dummy_lock, 
sched::wait_object<osv::acpi_interrupt::process_interrupts()::{lambda()#1}>, 
sched::wait_object<osv::acpi_interrupt::process_interrupts()::{lambda()#2}> 
>(sched::thread::dummy_lock&, 
sched::wait_object<osv::acpi_interrupt::process_interrupts()::{lambda()#1}>&&, 
sched::wait_object<osv::acpi_interrupt::process_interrupts()::{lambda()#2}>&&) 
(mtx=<synthetic pointer>)
    at include/osv/sched.hh:1206
#5  
sched::thread::wait_for<osv::acpi_interrupt::process_interrupts()::{lambda()#1},
 
osv::acpi_interrupt::process_interrupts()::{lambda()#2}>(osv::acpi_interrupt::process_interrupts()::{lambda()#1}&&,
 
osv::acpi_interrupt::process_interrupts()::{lambda()#2}&&) () at 
include/osv/sched.hh:1230
#6  osv::acpi_interrupt::process_interrupts (this=0xffffa00001382380) at 
drivers/acpi.cc:244
#7  osv::acpi_interrupt::acpi_interrupt(unsigned int, unsigned int 
(*)(void*), void*)::{lambda()#1}::operator()() const (__closure=<optimized 
out>)
    at drivers/acpi.cc:228
#8  std::_Function_handler<void (), 
osv::acpi_interrupt::acpi_interrupt(unsigned int, unsigned int (*)(void*), 
void*)::{lambda()#1}>::_M_invoke(std::_Any_data const&) (__functor=...) at 
/usr/include/c++/5/functional:1871
#9  0x00000000003e7157 in sched::thread_main_c (t=0xffff800001612040) at 
arch/x64/arch-switch.hh:193
#10 0x000000000038ab23 in thread_main () at arch/x64/entry.S:113

(gdb) osv thread 13 (named ">init")
(gdb) bt
#0  0x00000000003e1aa6 in sched::thread::switch_to 
(this=0xffff800001417040, this@entry=0xffff800001d4d040) at 
arch/x64/arch-switch.hh:74
#1  0x00000000003e49c2 in sched::cpu::reschedule_from_interrupt 
(this=0xffff800000f2b040, called_from_yield=called_from_yield@entry=false, 
    preempt_after=..., preempt_after@entry=...) at core/sched.cc:339
#2  0x00000000003e4e7c in sched::cpu::schedule () at core/sched.cc:228
#3  0x00000000003e6d72 in sched::thread::wait (this=0xffff800001629040) at 
core/sched.cc:1214
#4  sched::thread::do_wait_until<sched::noninterruptible, lockfree::mutex, 
sched::thread::reaper::reap()::<lambda()> > (pred=..., mtx=...)
    at include/osv/sched.hh:1068
#5  sched::thread::wait_until<sched::thread::reaper::reap()::<lambda()> > 
(pred=..., mtx=...) at include/osv/sched.hh:1086
#6  sched::thread::reaper::reap (this=0xffffa00001382b80) at 
core/sched.cc:1553
#7  0x00000000003e6eec in sched::thread::reaper::<lambda()>::operator() 
(__closure=<optimized out>) at core/sched.cc:1544
#8  std::_Function_handler<void(), 
sched::thread::reaper::reaper()::<lambda()> >::_M_invoke(const 
std::_Any_data &) (__functor=...)
    at /usr/include/c++/5/functional:1871
#9  0x00000000003e7157 in sched::thread_main_c (t=0xffff800001629040) at 
arch/x64/arch-switch.hh:193
#10 0x000000000038ab23 in thread_main () at arch/x64/entry.S:113

(gdb) osv thread 27 (named ">init")
(gdb) bt
#0  0x00000000003e1aa6 in sched::thread::switch_to 
(this=0xffff800001417040, this@entry=0xffff8000013c0040) at 
arch/x64/arch-switch.hh:74
#1  0x00000000003e49c2 in sched::cpu::reschedule_from_interrupt 
(this=0xffff800000f2b040, called_from_yield=called_from_yield@entry=false, 
    preempt_after=..., preempt_after@entry=...) at core/sched.cc:339
#2  0x00000000003e4e7c in sched::cpu::schedule () at core/sched.cc:228
#3  0x00000000003e5542 in sched::thread::wait 
(this=this@entry=0xffff800001d4d040) at core/sched.cc:1214
#4  0x00000000004171f0 in 
sched::thread::do_wait_until<sched::noninterruptible, 
sched::thread::dummy_lock, osv::application::join()::<lambda()> > (
    mtx=<synthetic pointer>, pred=...) at include/osv/sched.hh:1068
#5  sched::thread::wait_until<osv::application::join()::<lambda()> > 
(pred=...) at include/osv/sched.hh:1079
#6  osv::application::join (this=0xffffa00002530810) at core/app.cc:234
#7  0x0000000000213b08 in do_main_thread (_main_args=<optimized out>) at 
loader.cc:481
#8  0x00000000004485d6 in pthread_private::pthread::<lambda()>::operator() 
(__closure=0xffffa0000147ab00) at libc/pthread.cc:114
#9  std::_Function_handler<void(), pthread_private::pthread::pthread(void* 
(*)(void*), void*, sigset_t, const 
pthread_private::thread_attr*)::<lambda()> >::_M_invoke(const 
std::_Any_data &) (__functor=...) at /usr/include/c++/5/functional:1871
#10 0x00000000003e7157 in sched::thread_main_c (t=0xffff800001d4d040) at 
arch/x64/arch-switch.hh:193
#11 0x000000000038ab23 in thread_main () at arch/x64/entry.S:113

*(gdb) osv thread 198 // named ">>>init"*
*(gdb) bt*
*#0  0x00000000003e1aa6 in sched::thread::switch_to 
(this=this@entry=0xffff800001417040) at arch/x64/arch-switch.hh:74*
*#1  0x00000000003e49c2 in sched::cpu::reschedule_from_interrupt 
(this=0xffff800000f2b040, called_from_yield=called_from_yield@entry=false, *
*    preempt_after=..., preempt_after@entry=...) at core/sched.cc:339*
*#2  0x00000000003e4e7c in sched::cpu::schedule () at core/sched.cc:228*
*#3  0x00000000003e5542 in sched::thread::wait 
(this=this@entry=0xffff800002a45040) at core/sched.cc:1214*
*#4  0x000000000040f915 in sched::thread::do_wait_for<lockfree::mutex, 
sched::wait_object<waitqueue> >(lockfree::mutex&, 
sched::wait_object<waitqueue>&&) (mtx=...) at include/osv/sched.hh:1206*
*#5  sched::thread::wait_for<waitqueue&> (mtx=...) at 
include/osv/sched.hh:1222*
*#6  waitqueue::wait (this=this@entry=0xffffa00002983fd0, mtx=...) at 
core/waitqueue.cc:56*
*#7  0x00000000003d7d93 in futex (uaddr=<optimized out>, op=<optimized 
out>, val=<optimized out>, timeout=0x0, uaddr2=<optimized out>, *
*    val3=<optimized out>) at linux.cc:95*
*#8  0x00000000003d81e7 in syscall (number=number@entry=202) at 
linux.cc:368*
*#9  0x00000000003d8c64 in syscall_wrapper (number=202, p1=17592206340496, 
p2=0, p3=0, p4=0, p5=0, p6=0) at linux.cc:429*
*#10 0x000000000038ab98 in syscall_entry () at arch/x64/entry.S:261*
*#11 0x00001000010a1ce3 in runtime.futex () at 
/home/wkozaczuk/tools/go/src/runtime/sys_linux_amd64.s:425*
*Backtrace stopped: previous frame inner to this frame (corrupt stack?)*

*(gdb) osv thread 199 // named ">>>>init"*
*(gdb) bt*
*#0  0x00000000003e1aa6 in sched::thread::switch_to 
(this=0xffff800001417040, this@entry=0xffff80000111f040) at 
arch/x64/arch-switch.hh:74*
*#1  0x00000000003e49c2 in sched::cpu::reschedule_from_interrupt 
(this=0xffff800000f2b040, called_from_yield=called_from_yield@entry=false, *
*    preempt_after=..., preempt_after@entry=...) at core/sched.cc:339*
*#2  0x00000000003e4e7c in sched::cpu::schedule () at core/sched.cc:228*
*#3  0x00000000003e5542 in sched::thread::wait 
(this=this@entry=0xffff800002b42040) at core/sched.cc:1214*
*#4  0x00000000003d7cdd in sched::thread::do_wait_for<lockfree::mutex, 
sched::wait_object<sched::timer>, sched::wait_object<waitqueue> 
>(lockfree::mutex&, sched::wait_object<sched::timer>&&, 
sched::wait_object<waitqueue>&&) (mtx=...) at include/osv/sched.hh:1206*
*#5  sched::thread::wait_for<sched::timer&, waitqueue&> (mtx=...) at 
include/osv/sched.hh:1222*
*#6  futex (uaddr=<optimized out>, op=<optimized out>, val=<optimized out>, 
timeout=<optimized out>, uaddr2=<optimized out>, val3=<optimized out>)*
*    at linux.cc:85*
*#7  0x00000000003d81e7 in syscall (number=number@entry=202) at 
linux.cc:368*
*#8  0x00000000003d8c64 in syscall_wrapper (number=202, p1=17592206338904, 
p2=0, p3=0, p4=35184376675864, p5=0, p6=0) at linux.cc:429*
*#9  0x000000000038ab98 in syscall_entry () at arch/x64/entry.S:261*
*#10 0x00001000010a1ce3 in runtime.futex () at 
/home/wkozaczuk/tools/go/src/runtime/sys_linux_amd64.s:425*
*Backtrace stopped: previous frame inner to this frame (corrupt stack?)*

*(gdb) osv thread 200 // named ">>>>init"*
*(gdb) bt*
*#0  0x00000000003e1aa6 in sched::thread::switch_to 
(this=0xffff800001417040, this@entry=0xffff800002a45040) at 
arch/x64/arch-switch.hh:74*
*#1  0x00000000003e49c2 in sched::cpu::reschedule_from_interrupt 
(this=0xffff800000f2b040, called_from_yield=called_from_yield@entry=false, *
*    preempt_after=..., preempt_after@entry=...) at core/sched.cc:339*
*#2  0x00000000003e4e7c in sched::cpu::schedule () at core/sched.cc:228*
*#3  0x00000000003e5542 in sched::thread::wait 
(this=this@entry=0xffff800002c4d040) at core/sched.cc:1214*
*#4  0x000000000040f915 in sched::thread::do_wait_for<lockfree::mutex, 
sched::wait_object<waitqueue> >(lockfree::mutex&, 
sched::wait_object<waitqueue>&&) (mtx=...) at include/osv/sched.hh:1206*
*#5  sched::thread::wait_for<waitqueue&> (mtx=...) at 
include/osv/sched.hh:1222*
*#6  waitqueue::wait (this=this@entry=0xffffa00002983e50, mtx=...) at 
core/waitqueue.cc:56*
*#7  0x00000000003d7d93 in futex (uaddr=<optimized out>, op=<optimized 
out>, val=<optimized out>, timeout=0x0, uaddr2=<optimized out>, *
*    val3=<optimized out>) at linux.cc:95*
*#8  0x00000000003d81e7 in syscall (number=number@entry=202) at 
linux.cc:368*
*#9  0x00000000003d8c64 in syscall_wrapper (number=202, p1=842350602512, 
p2=0, p3=0, p4=0, p5=0, p6=0) at linux.cc:429*
*#10 0x000000000038ab98 in syscall_entry () at arch/x64/entry.S:261*
*#11 0x00001000010a1ce3 in runtime.futex () at 
/home/wkozaczuk/tools/go/src/runtime/sys_linux_amd64.s:425*
*Backtrace stopped: previous frame inner to this frame (corrupt stack?)*

Here is also the output from the app with some debug statements from futex 
calls:

futex:WAIT from thread: 198, *uaddr=[0], val=[0]
futex:WAKE from thread: 200, *uaddr=[1], val=[1]
futex:WAIT from thread: 200, *uaddr=[0], val=[0]
syscall(): unimplemented system call 267
futex:WAIT from thread: 197, *uaddr=[0], val=[0]
futex:WAKE from thread: 199, *uaddr=[1], val=[1]
futex:WAIT from thread: 198, *uaddr=[0], val=[0]
futex:WAKE from thread: 200, *uaddr=[1], val=[1]
futex:WAIT from thread: 200, *uaddr=[0], val=[0]
futex:WAKE from thread: 197, *uaddr=[1], val=[1]
futex:WAKE from thread: 200, *uaddr=[1], val=[1]
futex:WAIT from thread: 197, *uaddr=[0], val=[0]
futex:WAIT from thread: 200, *uaddr=[0], val=[0]
futex:WAKE from thread: 198, *uaddr=[1], val=[1]
futex:WAIT from thread: 198, *uaddr=[0], val=[0]
Hello, 世界
futex:WAKE from thread: 199, *uaddr=[1], val=[1]
futex:WAIT from thread: 198, *uaddr=[0], val=[0]
Go version: go1.8.3
futex:WAKE from thread: 199, *uaddr=[1], val=[1]
futex:WAIT from thread: 197, *uaddr=[0], val=[0]
futex:WAKE from thread: 198, *uaddr=[1], val=[1]
futex:WAIT from thread: 198, *uaddr=[0], val=[0]
futex:WAKE from thread: 199, *uaddr=[1], val=[1]
futex:WAIT from thread: 198, *uaddr=[0], val=[0]
futex:WAIT from thread: 199, *uaddr=[0], val=[0]
futex:WAIT from thread: 199, *uaddr=[0], val=[0]

As you can see the thread 197 calling futex was actually terminated when I 
connected with gdb.

Here is also the portion of code in linux.cc as it is a little different 
from master. I took it from @benoit go branch and added couple of debug 
statements:

 50 // We don't expect applications to use the Linux futex() system call 
(it is
 51 // normally only used to implement higher-level synchronization 
mechanisms),
 52 // but unfortunately gcc's C++ runtime uses a subset of futex in the
 53 // __cxa__guard_* functions, which safeguard the concurrent 
initialization
 54 // of function-scope static objects. We only implement here this subset.
 55 // The __cxa_guard_* functions only call futex in the rare case of 
contention,
 56 // in fact so rarely that OSv existed for a year before anyone noticed 
futex
 57 // was missing. So the performance of this implementation is not 
critical.
 58 static std::unordered_map<void*, std::shared_ptr<waitqueue>> queues;
 59 static mutex queues_mutex;
 60 enum {
 61     FUTEX_WAIT           = 0,
 62     FUTEX_WAKE           = 1,
 63     FUTEX_PRIVATE_FLAG   = 128,
 64     FUTEX_CLOCK_REALTIME = 256,
 65     FUTEX_CMD_MASK       = ~(FUTEX_PRIVATE_FLAG|FUTEX_CLOCK_REALTIME),
 66 };
 67 
 68 int futex(int *uaddr, int op, int val, const struct timespec *timeout,
 69         int *uaddr2, int val3)
 70 {
 71     switch (op & FUTEX_CMD_MASK) {
 72     case FUTEX_WAIT:
 73         debug_always("futex:WAIT from thread: %d, *uaddr=[%d], 
val=[%d]\n", sched::thread::current()->id(), *uaddr, val);
 74         WITH_LOCK(queues_mutex) {
 75             if (*uaddr == val) {
 76                 auto i = queues.find(uaddr);
 77                 if (i == queues.end()) {
 78                     queues[uaddr] = std::make_shared<waitqueue>();
 79                 }
 80                 std::shared_ptr<waitqueue> q = queues[uaddr];
 81                 if (timeout) {
 82                     sched::timer tmr(*sched::thread::current());
 83                     tmr.set(std::chrono::seconds(timeout->tv_sec) +
 84                             std::chrono::nanoseconds(timeout->tv_nsec));
 85                     sched::thread::wait_for(queues_mutex, tmr, *q);
 86                     // FIXME: testing if tmr was expired isn't quite 
right -
 87                     // we could have had both a wakeup and timer 
expiration
 88                     // racing. It would be more correct to check if we 
were
 89                     // waken by a FUTEX_WAKE. But how?
 90                     if (tmr.expired()) {
 91                         errno = ETIMEDOUT;
 92                         return -1;
 93                     }
 94                 } else {
 95                     q->wait(queues_mutex);
 96                 }
 97                 return 0;
 98             } else {
 99                 errno = EWOULDBLOCK;
100                 return -1;
101             }
102         }
103     case FUTEX_WAKE:
104         debug_always("futex:WAKE from thread: %d, *uaddr=[%d], 
val=[%d]\n", sched::thread::current()->id(), *uaddr, val);
105         if(val < 0) {
106             errno = EINVAL;
107             return -1;
108         }
109 
110         WITH_LOCK(queues_mutex) {
111             auto i = queues.find(uaddr);
112             if (i != queues.end()) {
113                 int waken = 0;
114                 while( (val > waken) && !(i->second->empty()) ) {
115                     i->second->wake_one(queues_mutex);
116                     waken++;
117                 }
118                 if(i->second->empty()) {
119                     queues.erase(i);
120                 }
121                 return waken;
122             }
123         }
124         return 0;
125     default:
126         abort("Unimplemented futex() operation %d\n", op);
127     }
128 }

I cannot see any clear deadlock situation so I am wondering what might be 
going on. Anybody any ideas?

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to