[
https://issues.apache.org/jira/browse/MESOS-3201?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14658802#comment-14658802
]
Joris Van Remoortere commented on MESOS-3201:
---------------------------------------------
Indeed it is.
If you run {{valgrind --tool=helgrind --num-callers=60 ./tests
--gtest_filter="IOTest.Read" --gtest_repeat=10}}
you will likely find this (as well as other locking order violations) in the
output:
{code}
==2083== Thread #10: lock order "0xC911BD8 before 0xC9110C0" violated
==2083==
==2083== Observed (incorrect) order is: acquisition of lock at 0xC9110C0
==2083== at 0x4C33596: pthread_mutex_lock (in
/usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
==2083== by 0x72F4C4: __gthread_mutex_lock(pthread_mutex_t*)
(gthr-default.h:748)
==2083== by 0x753F74: std::mutex::lock() (mutex:135)
==2083== by 0x753F58: Synchronized<std::mutex>
synchronize<std::mutex>(std::mutex*)::{lambda(std::mutex*)#1}::operator()(std::mutex*)
const (in /mesos/build/3rdparty/libprocess/tests)
==2083== by 0x753F37: Synchronized<std::mutex>
synchronize<std::mutex>(std::mutex*)::{lambda(std::mutex*)#1}::__invoke(std::mutex*)
(synchronized.hpp:58)
==2083== by 0x753DA8: Synchronized<std::mutex>::Synchronized(std::mutex*,
void (*)(std::mutex*), void (*)(std::mutex*)) (synchronized.hpp:35)
==2083== by 0x753C7B: Synchronized<std::mutex>
synchronize<std::mutex>(std::mutex*) (synchronized.hpp:56)
==2083== by 0x7E3C31: process::handle_async(ev_loop*, ev_async*, int)
(libev.cpp:48)
==2083== by 0x827EF4: ev_invoke_pending (ev.c:2994)
==2083== by 0x828A72: ev_run (ev.c:3394)
==2083== by 0x7E41BA: ev_loop(ev_loop*, int) (ev.h:826)
==2083== by 0x7E4133: process::EventLoop::run() (libev.cpp:135)
==2083== by 0x7B31AE: void std::_Bind_simple<void
(*())()>::_M_invoke<>(std::_Index_tuple<>) (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x7B3184: std::_Bind_simple<void (*())()>::operator()() (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x7B315B: std::thread::_Impl<std::_Bind_simple<void (*())()>
>::_M_run() (in /mesos/build/3rdparty/libprocess/tests)
==2083== by 0x6371E2F: ??? (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.20)
==2083== by 0x4C31FD6: ??? (in
/usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
==2083== by 0x4E456A9: start_thread (pthread_create.c:333)
==2083== by 0x68E2EEC: clone (clone.S:109)
==2083==
==2083== followed by a later acquisition of lock at 0xC911BD8
==2083== at 0x4C33596: pthread_mutex_lock (in
/usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
==2083== by 0x52DA84: __gthread_mutex_lock(pthread_mutex_t*)
(gthr-default.h:748)
==2083== by 0x52DA54: __gthread_recursive_mutex_lock(pthread_mutex_t*)
(gthr-default.h:810)
==2083== by 0x5AFE04: std::recursive_mutex::lock() (mutex:176)
==2083== by 0x5AFDE8: Synchronized<std::recursive_mutex>
synchronize<std::recursive_mutex>(std::recursive_mutex*)::{lambda(std::recursive_mutex*)#1}::operator()(std::recursive_mutex*)
const (in /mesos/build/3rdparty/libprocess/tests)
==2083== by 0x5AFDC7: Synchronized<std::recursive_mutex>
synchronize<std::recursive_mutex>(std::recursive_mutex*)::{lambda(std::recursive_mutex*)#1}::__invoke(std::recursive_mutex*)
(synchronized.hpp:58)
==2083== by 0x5AFC2E:
Synchronized<std::recursive_mutex>::Synchronized(std::recursive_mutex*, void
(*)(std::recursive_mutex*), void (*)(std::recursive_mutex*))
(synchronized.hpp:35)
==2083== by 0x5AA70B: Synchronized<std::recursive_mutex>
synchronize<std::recursive_mutex>(std::recursive_mutex*) (synchronized.hpp:56)
==2083== by 0x75A52C: process::ProcessManager::use(process::UPID const&)
(process.cpp:2136)
==2083== by 0x7694D8: process::ProcessManager::terminate(process::UPID
const&, bool, process::ProcessBase*) (process.cpp:2613)
==2083== by 0x76BF0A: process::terminate(process::UPID const&, bool)
(process.cpp:3147)
==2083== by 0x72C98C: process::Latch::trigger() (latch.cpp:53)
==2083== by 0x41F394:
process::internal::awaited(process::Owned<process::Latch>) (future.hpp:1001)
==2083== by 0x48F903: void std::_Bind<void
(*(process::Owned<process::Latch>))(process::Owned<process::Latch>)>::__call<void,
process::Future<unsigned long> const&,
0ul>(std::tuple<process::Future<unsigned long> const&>&&,
std::_Index_tuple<0ul>) (functional:1263)
==2083== by 0x48F88C: void std::_Bind<void
(*(process::Owned<process::Latch>))(process::Owned<process::Latch>)>::operator()<process::Future<unsigned
long> const&, void>(process::Future<unsigned long> const&) (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x48F841: std::_Function_handler<void (process::Future<unsigned
long> const&), std::_Bind<void
(*(process::Owned<process::Latch>))(process::Owned<process::Latch>)>
>::_M_invoke(std::_Any_data const&, process::Future<unsigned long> const&)
(functional:2039)
==2083== by 0x7249D7: std::function<void (process::Future<unsigned long>
const&)>::operator()(process::Future<unsigned long> const&) const (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x72B9CB: void process::internal::run<std::function<void
(process::Future<unsigned long> const&)>, process::Future<unsigned
long>&>(std::vector<std::function<void (process::Future<unsigned long>
const&)>, std::allocator<std::function<void (process::Future<unsigned long>
const&)> > > const&, process::Future<unsigned long>&) (future.hpp:444)
==2083== by 0x72C2EC: process::Promise<unsigned
long>::discard(process::Future<unsigned long>) (future.hpp:807)
==2083== by 0x71E712: process::Promise<unsigned long>::discard()
(future.hpp:604)
==2083== by 0x7179EE: process::io::internal::read(int, void*, unsigned long,
std::shared_ptr<process::Promise<unsigned long> > const&,
process::Future<short> const&) (io.cpp:45)
==2083== by 0x72A527: void std::_Bind<void (*(int, void*, unsigned long,
std::shared_ptr<process::Promise<unsigned long> >, std::_Placeholder<1>))(int,
void*, unsigned long, std::shared_ptr<process::Promise<unsigned long> > const&,
process::Future<short> const&)>::__call<void, process::Future<short> const&,
0ul, 1ul, 2ul, 3ul, 4ul>(std::tuple<process::Future<short> const&>&&,
std::_Index_tuple<0ul, 1ul, 2ul, 3ul, 4ul>) (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x72A41C: void std::_Bind<void (*(int, void*, unsigned long,
std::shared_ptr<process::Promise<unsigned long> >, std::_Placeholder<1>))(int,
void*, unsigned long, std::shared_ptr<process::Promise<unsigned long> > const&,
process::Future<short> const&)>::operator()<process::Future<short> const&,
void>(process::Future<short> const&) (in /mesos/build/3rdparty/libprocess/tests)
==2083== by 0x72A3CC: process::Future<short> const&
process::Future<short>::onAny<std::_Bind<void (*(int, void*, unsigned long,
std::shared_ptr<process::Promise<unsigned long> >, std::_Placeholder<1>))(int,
void*, unsigned long, std::shared_ptr<process::Promise<unsigned long> > const&,
process::Future<short> const&)>, void>(std::_Bind<void (*(int, void*, unsigned
long, std::shared_ptr<process::Promise<unsigned long> >,
std::_Placeholder<1>))(int, void*, unsigned long,
std::shared_ptr<process::Promise<unsigned long> > const&,
process::Future<short> const&)>&&, process::Future<short>::Prefer)
const::{lambda(process::Future<short>
const&)#1}::operator()(process::Future<short> const&) (future.hpp:257)
==2083== by 0x72A181: std::_Function_handler<void (process::Future<short>
const&), process::Future<short> const&
process::Future<short>::onAny<std::_Bind<void (*(int, void*, unsigned long,
std::shared_ptr<process::Promise<unsigned long> >, std::_Placeholder<1>))(int,
void*, unsigned long, std::shared_ptr<process::Promise<unsigned long> > const&,
process::Future<short> const&)>, void>(std::_Bind<void (*(int, void*, unsigned
long, std::shared_ptr<process::Promise<unsigned long> >,
std::_Placeholder<1>))(int, void*, unsigned long,
std::shared_ptr<process::Promise<unsigned long> > const&,
process::Future<short> const&)>&&, process::Future<short>::Prefer)
const::{lambda(process::Future<short> const&)#1}>::_M_invoke(std::_Any_data
const&, process::Future<short> const&) (functional:2039)
==2083== by 0x727AF7: std::function<void (process::Future<short>
const&)>::operator()(process::Future<short> const&) const (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x72779B: void process::internal::run<std::function<void
(process::Future<short> const&)>,
process::Future<short>&>(std::vector<std::function<void (process::Future<short>
const&)>, std::allocator<std::function<void (process::Future<short> const&)> >
> const&, process::Future<short>&) (future.hpp:444)
==2083== by 0x7EBFFC:
process::Promise<short>::discard(process::Future<short>) (future.hpp:807)
==2083== by 0x7E9332: process::Promise<short>::discard() (future.hpp:604)
==2083== by 0x7EA445: void
process::_run_in_event_loop<short>(std::function<process::Future<short> ()>
const&, process::Owned<process::Promise<short> > const&) (libev.hpp:66)
==2083== by 0x7EAD83: void std::_Bind<void
(*(std::function<process::Future<short> ()>,
process::Owned<process::Promise<short> >))(std::function<process::Future<short>
()> const&, process::Owned<process::Promise<short> > const&)>::__call<void, ,
0ul, 1ul>(std::tuple<>&&, std::_Index_tuple<0ul, 1ul>) (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x7EAD05: void std::_Bind<void
(*(std::function<process::Future<short> ()>,
process::Owned<process::Promise<short> >))(std::function<process::Future<short>
()> const&, process::Owned<process::Promise<short> > const&)>::operator()<,
void>() (in /mesos/build/3rdparty/libprocess/tests)
==2083== by 0x7EA7AC: std::_Function_handler<void (), std::_Bind<void
(*(std::function<process::Future<short> ()>,
process::Owned<process::Promise<short> >))(std::function<process::Future<short>
()> const&, process::Owned<process::Promise<short> > const&)>
>::_M_invoke(std::_Any_data const&) (functional:2039)
==2083== by 0x490BBD: std::function<void ()>::operator()() const (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x7E3D39: process::handle_async(ev_loop*, ev_async*, int)
(libev.cpp:57)
==2083== by 0x827EF4: ev_invoke_pending (ev.c:2994)
==2083== by 0x828A72: ev_run (ev.c:3394)
==2083== by 0x7E41BA: ev_loop(ev_loop*, int) (ev.h:826)
==2083== by 0x7E4133: process::EventLoop::run() (libev.cpp:135)
==2083== by 0x7B31AE: void std::_Bind_simple<void
(*())()>::_M_invoke<>(std::_Index_tuple<>) (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x7B3184: std::_Bind_simple<void (*())()>::operator()() (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x7B315B: std::thread::_Impl<std::_Bind_simple<void (*())()>
>::_M_run() (in /mesos/build/3rdparty/libprocess/tests)
==2083== by 0x6371E2F: ??? (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.20)
==2083== by 0x4C31FD6: ??? (in
/usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
==2083== by 0x4E456A9: start_thread (pthread_create.c:333)
==2083== by 0x68E2EEC: clone (clone.S:109)
==2083==
==2083== Lock at 0xC911BD8 was first observed
==2083== at 0x4C33596: pthread_mutex_lock (in
/usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
==2083== by 0x52DA84: __gthread_mutex_lock(pthread_mutex_t*)
(gthr-default.h:748)
==2083== by 0x52DA54: __gthread_recursive_mutex_lock(pthread_mutex_t*)
(gthr-default.h:810)
==2083== by 0x5AFE04: std::recursive_mutex::lock() (mutex:176)
==2083== by 0x5AFDE8: Synchronized<std::recursive_mutex>
synchronize<std::recursive_mutex>(std::recursive_mutex*)::{lambda(std::recursive_mutex*)#1}::operator()(std::recursive_mutex*)
const (in /mesos/build/3rdparty/libprocess/tests)
==2083== by 0x5AFDC7: Synchronized<std::recursive_mutex>
synchronize<std::recursive_mutex>(std::recursive_mutex*)::{lambda(std::recursive_mutex*)#1}::__invoke(std::recursive_mutex*)
(synchronized.hpp:58)
==2083== by 0x5AFC2E:
Synchronized<std::recursive_mutex>::Synchronized(std::recursive_mutex*, void
(*)(std::recursive_mutex*), void (*)(std::recursive_mutex*))
(synchronized.hpp:35)
==2083== by 0x5AA70B: Synchronized<std::recursive_mutex>
synchronize<std::recursive_mutex>(std::recursive_mutex*) (synchronized.hpp:56)
==2083== by 0x7683CF: process::ProcessManager::spawn(process::ProcessBase*,
bool) (process.cpp:2354)
==2083== by 0x76BE7D: process::spawn(process::ProcessBase*, bool)
(process.cpp:3138)
==2083== by 0x76F138: process::PID<process::GarbageCollector>
process::spawn<process::GarbageCollector>(process::GarbageCollector*, bool)
(process.hpp:412)
==2083== by 0x75BBE9: process::initialize(std::string const&)
(process.cpp:939)
==2083== by 0x4A6580: main (main.cpp:35)
==2083== Address 0xc911bd8 is 56 bytes inside a block of size 272 alloc'd
==2083== at 0x4C2D520: operator new(unsigned long) (in
/usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
==2083== by 0x75AB4D: process::initialize(std::string const&)
(process.cpp:809)
==2083== by 0x4A6580: main (main.cpp:35)
==2083== Block was alloc'd by thread #1
==2083==
==2083== Lock at 0xC9110C0 was first observed
==2083== at 0x4C33596: pthread_mutex_lock (in
/usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
==2083== by 0x72F4C4: __gthread_mutex_lock(pthread_mutex_t*)
(gthr-default.h:748)
==2083== by 0x753F74: std::mutex::lock() (mutex:135)
==2083== by 0x753F58: Synchronized<std::mutex>
synchronize<std::mutex>(std::mutex*)::{lambda(std::mutex*)#1}::operator()(std::mutex*)
const (in /mesos/build/3rdparty/libprocess/tests)
==2083== by 0x753F37: Synchronized<std::mutex>
synchronize<std::mutex>(std::mutex*)::{lambda(std::mutex*)#1}::__invoke(std::mutex*)
(synchronized.hpp:58)
==2083== by 0x753DA8: Synchronized<std::mutex>::Synchronized(std::mutex*,
void (*)(std::mutex*), void (*)(std::mutex*)) (synchronized.hpp:35)
==2083== by 0x753C7B: Synchronized<std::mutex>
synchronize<std::mutex>(std::mutex*) (synchronized.hpp:56)
==2083== by 0x7E9770: process::Future<short>
process::run_in_event_loop<short>(std::function<process::Future<short> ()>
const&) (libev.hpp:87)
==2083== by 0x7E8FAD: process::io::poll(int, short) (libev_poll.cpp:140)
==2083== by 0x7FFEF2: process::network::PollSocketImpl::accept()
(poll_socket.cpp:92)
==2083== by 0x541117: process::network::Socket::accept() (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x75BB32: process::initialize(std::string const&)
(process.cpp:931)
==2083== by 0x4A6580: main (main.cpp:35)
==2083== Address 0xc9110c0 is 0 bytes inside a block of size 40 alloc'd
==2083== at 0x4C2D520: operator new(unsigned long) (in
/usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
==2083== by 0x412B93: __cxx_global_var_init2 (libev.cpp:38)
==2083== by 0x412C92: _GLOBAL__sub_I_libev.stdout.XXX.2523.skMZhZ.ii
(libev.stdout.XXX.2523.skMZhZ.ii:0)
==2083== by 0x82D70C: __libc_csu_init (in
/mesos/build/3rdparty/libprocess/tests)
==2083== by 0x67FC9CE: (below main) (libc-start.c:245)
==2083== Block was alloc'd by thread #1
{code}
After applying this review I can see no more locking violations.
https://reviews.apache.org/r/37089/
> Libev handle_async can deadlock with run_in_event_loop
> ------------------------------------------------------
>
> Key: MESOS-3201
> URL: https://issues.apache.org/jira/browse/MESOS-3201
> Project: Mesos
> Issue Type: Bug
> Components: libprocess
> Affects Versions: 0.23.0
> Reporter: Joris Van Remoortere
> Assignee: Joris Van Remoortere
> Labels: libprocess, mesosphere
>
> Due to the arbitrary nature of the functions that are executed in
> handle_async, invoking them under the (A) {{watchers_mutex}} can lead to
> deadlocks if (B) is acquired before calling {{run_in_event_loop}} and (B) is
> also acquired within the arbitrary function.
> {code}
> ==82679== Thread #10: lock order "0x60774F8 before 0x60768C0" violated
> ==82679==
> ==82679== Observed (incorrect) order is: acquisition of lock at 0x60768C0
> ==82679== at 0x4C32145: pthread_mutex_lock (in
> /usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
> ==82679== by 0x692C9B: __gthread_mutex_lock(pthread_mutex_t*)
> (gthr-default.h:748)
> ==82679== by 0x6950BF: std::mutex::lock() (mutex:134)
> ==82679== by 0x696219: Synchronized<std::mutex>
> synchronize<std::mutex>(std::mutex*)::{lambda(std::mutex*)#1}::operator()(std::mutex*)
> const (synchronized.hpp:58)
> ==82679== by 0x696238: Synchronized<std::mutex>
> synchronize<std::mutex>(std::mutex*)::{lambda(std::mutex*)#1}::_FUN(std::mutex*)
> (synchronized.hpp:58)
> ==82679== by 0x6984CF: Synchronized<std::mutex>::Synchronized(std::mutex*,
> void (*)(std::mutex*), void (*)(std::mutex*)) (synchronized.hpp:35)
> ==82679== by 0x6962DE: Synchronized<std::mutex>
> synchronize<std::mutex>(std::mutex*) (synchronized.hpp:60)
> ==82679== by 0x728FE1: process::handle_async(ev_loop*, ev_async*, int)
> (libev.cpp:48)
> ==82679== by 0x761384: ev_invoke_pending (ev.c:2994)
> ==82679== by 0x7643C4: ev_run (ev.c:3394)
> ==82679== by 0x728E37: ev_loop (ev.h:826)
> ==82679== by 0x729469: process::EventLoop::run() (libev.cpp:135)
> ==82679==
> ==82679== followed by a later acquisition of lock at 0x60774F8
> ==82679== at 0x4C32145: pthread_mutex_lock (in
> /usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
> ==82679== by 0x4C6F9D: __gthread_mutex_lock(pthread_mutex_t*)
> (gthr-default.h:748)
> ==82679== by 0x4C6FED: __gthread_recursive_mutex_lock(pthread_mutex_t*)
> (gthr-default.h:810)
> ==82679== by 0x4F5D3D: std::recursive_mutex::lock() (mutex:175)
> ==82679== by 0x516513: Synchronized<std::recursive_mutex>
> synchronize<std::recursive_mutex>(std::recursive_mutex*)::{lambda(std::recursive_mutex*)#1}::operator()(std::recursive_mutex*)
> const (synchronized.hpp:58)
> ==82679== by 0x516532: Synchronized<std::recursive_mutex>
> synchronize<std::recursive_mutex>(std::recursive_mutex*)::{lambda(std::recursive_mutex*)#1}::_FUN(std::recursive_mutex*)
> (synchronized.hpp:58)
> ==82679== by 0x52E619:
> Synchronized<std::recursive_mutex>::Synchronized(std::recursive_mutex*, void
> (*)(std::recursive_mutex*), void (*)(std::recursive_mutex*))
> (synchronized.hpp:35)
> ==82679== by 0x5165D4: Synchronized<std::recursive_mutex>
> synchronize<std::recursive_mutex>(std::recursive_mutex*) (synchronized.hpp:60)
> ==82679== by 0x6BF4E1: process::ProcessManager::use(process::UPID const&)
> (process.cpp:2127)
> ==82679== by 0x6C2B8C: process::ProcessManager::terminate(process::UPID
> const&, bool, process::ProcessBase*) (process.cpp:2604)
> ==82679== by 0x6C6C3C: process::terminate(process::UPID const&, bool)
> (process.cpp:3107)
> ==82679== by 0x692B65: process::Latch::trigger() (latch.cpp:53)
> {code}
> This was introduced in
> https://github.com/apache/mesos/commit/849fc4d361e40062073324153ba97e98e294fdf2
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)