[
https://issues.apache.org/jira/browse/MESOS-2254?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Marco Massenzio updated MESOS-2254:
-----------------------------------
Description:
With more than 20 executors running on a slave with the posix isolator, we have
seen a very high cpu load (over 200%).
From profiling one thread (there were two, taking up all the cpu time. The
total CPU time was over 200%):
{code}
Running Time Self Symbol Name
27133.0ms 47.8% 0.0 _pthread_body 0x1adb50
27133.0ms 47.8% 0.0 thread_start
27133.0ms 47.8% 0.0 _pthread_start
27133.0ms 47.8% 0.0 _pthread_body
27133.0ms 47.8% 0.0 process::schedule(void*)
27133.0ms 47.8% 2.0
process::ProcessManager::resume(process::ProcessBase*)
27126.0ms 47.8% 1.0
process::ProcessBase::serve(process::Event const&)
27125.0ms 47.8% 0.0
process::DispatchEvent::visit(process::EventVisitor*) const
27125.0ms 47.8% 0.0
process::ProcessBase::visit(process::DispatchEvent const&)
27125.0ms 47.8% 0.0 std::__1::function<void
(process::ProcessBase*)>::operator()(process::ProcessBase*) const
27124.0ms 47.8% 0.0
std::__1::__function::__func<process::Future<mesos::ResourceStatistics>
process::dispatch<mesos::ResourceStatistics,
mesos::internal::slave::IsolatorProcess, mesos::ContainerID const&,
mesos::ContainerID>(process::PID<mesos::internal::slave::IsolatorProcess>
const&, process::Future<mesos::ResourceStatistics>
(mesos::internal::slave::IsolatorProcess::*)(mesos::ContainerID const&),
mesos::ContainerID)::'lambda'(process::ProcessBase*),
std::__1::allocator<process::Future<mesos::ResourceStatistics>
process::dispatch<mesos::ResourceStatistics,
mesos::internal::slave::IsolatorProcess, mesos::ContainerID const&,
mesos::ContainerID>(process::PID<mesos::internal::slave::IsolatorProcess>
const&, process::Future<mesos::ResourceStatistics>
(mesos::internal::slave::IsolatorProcess::*)(mesos::ContainerID const&),
mesos::ContainerID)::'lambda'(process::ProcessBase*)>, void
(process::ProcessBase*)>::operator()(process::ProcessBase*&&)
27124.0ms 47.8% 1.0
process::Future<mesos::ResourceStatistics>
process::dispatch<mesos::ResourceStatistics,
mesos::internal::slave::IsolatorProcess, mesos::ContainerID const&,
mesos::ContainerID>(process::PID<mesos::internal::slave::IsolatorProcess>
const&, process::Future<mesos::ResourceStatistics>
(mesos::internal::slave::IsolatorProcess::*)(mesos::ContainerID const&),
mesos::ContainerID)::'lambda'(process::ProcessBase*)::operator()(process::ProcessBase*)
const
27060.0ms 47.7% 1.0
mesos::internal::slave::PosixCpuIsolatorProcess::usage(mesos::ContainerID
const&)
27046.0ms 47.7% 2.0
mesos::internal::usage(int, bool, bool)
27023.0ms 47.6% 2.0 os::pstree(Option<int>)
26748.0ms 47.1% 23.0 os::processes()
24809.0ms 43.7% 349.0 os::process(int)
8199.0ms 14.4% 47.0 os::sysctl::string()
const
7562.0ms 13.3% 7562.0 __sysctl
{code}
We could see that usage() in usage/usage.cpp is causing this.
was:
With more than 20 executors running on a slave with the posix isolator, we have
seen an very high cpu load (over 200%).
From profiling one thread (there were two, taking up all the cpu time. The
total CPU time was over 200%):
{code}
Running Time Self Symbol Name
27133.0ms 47.8% 0.0 _pthread_body 0x1adb50
27133.0ms 47.8% 0.0 thread_start
27133.0ms 47.8% 0.0 _pthread_start
27133.0ms 47.8% 0.0 _pthread_body
27133.0ms 47.8% 0.0 process::schedule(void*)
27133.0ms 47.8% 2.0
process::ProcessManager::resume(process::ProcessBase*)
27126.0ms 47.8% 1.0
process::ProcessBase::serve(process::Event const&)
27125.0ms 47.8% 0.0
process::DispatchEvent::visit(process::EventVisitor*) const
27125.0ms 47.8% 0.0
process::ProcessBase::visit(process::DispatchEvent const&)
27125.0ms 47.8% 0.0 std::__1::function<void
(process::ProcessBase*)>::operator()(process::ProcessBase*) const
27124.0ms 47.8% 0.0
std::__1::__function::__func<process::Future<mesos::ResourceStatistics>
process::dispatch<mesos::ResourceStatistics,
mesos::internal::slave::IsolatorProcess, mesos::ContainerID const&,
mesos::ContainerID>(process::PID<mesos::internal::slave::IsolatorProcess>
const&, process::Future<mesos::ResourceStatistics>
(mesos::internal::slave::IsolatorProcess::*)(mesos::ContainerID const&),
mesos::ContainerID)::'lambda'(process::ProcessBase*),
std::__1::allocator<process::Future<mesos::ResourceStatistics>
process::dispatch<mesos::ResourceStatistics,
mesos::internal::slave::IsolatorProcess, mesos::ContainerID const&,
mesos::ContainerID>(process::PID<mesos::internal::slave::IsolatorProcess>
const&, process::Future<mesos::ResourceStatistics>
(mesos::internal::slave::IsolatorProcess::*)(mesos::ContainerID const&),
mesos::ContainerID)::'lambda'(process::ProcessBase*)>, void
(process::ProcessBase*)>::operator()(process::ProcessBase*&&)
27124.0ms 47.8% 1.0
process::Future<mesos::ResourceStatistics>
process::dispatch<mesos::ResourceStatistics,
mesos::internal::slave::IsolatorProcess, mesos::ContainerID const&,
mesos::ContainerID>(process::PID<mesos::internal::slave::IsolatorProcess>
const&, process::Future<mesos::ResourceStatistics>
(mesos::internal::slave::IsolatorProcess::*)(mesos::ContainerID const&),
mesos::ContainerID)::'lambda'(process::ProcessBase*)::operator()(process::ProcessBase*)
const
27060.0ms 47.7% 1.0
mesos::internal::slave::PosixCpuIsolatorProcess::usage(mesos::ContainerID
const&)
27046.0ms 47.7% 2.0
mesos::internal::usage(int, bool, bool)
27023.0ms 47.6% 2.0 os::pstree(Option<int>)
26748.0ms 47.1% 23.0 os::processes()
24809.0ms 43.7% 349.0 os::process(int)
8199.0ms 14.4% 47.0 os::sysctl::string()
const
7562.0ms 13.3% 7562.0 __sysctl
{code}
We could see that usage() in usage/usage.cpp is causing this.
> Posix CPU isolator usage call introduce high cpu load
> -----------------------------------------------------
>
> Key: MESOS-2254
> URL: https://issues.apache.org/jira/browse/MESOS-2254
> Project: Mesos
> Issue Type: Bug
> Reporter: Niklas Quarfot Nielsen
>
> With more than 20 executors running on a slave with the posix isolator, we
> have seen a very high cpu load (over 200%).
> From profiling one thread (there were two, taking up all the cpu time. The
> total CPU time was over 200%):
> {code}
> Running Time Self Symbol Name
> 27133.0ms 47.8% 0.0 _pthread_body 0x1adb50
> 27133.0ms 47.8% 0.0 thread_start
> 27133.0ms 47.8% 0.0 _pthread_start
> 27133.0ms 47.8% 0.0 _pthread_body
> 27133.0ms 47.8% 0.0 process::schedule(void*)
> 27133.0ms 47.8% 2.0
> process::ProcessManager::resume(process::ProcessBase*)
> 27126.0ms 47.8% 1.0
> process::ProcessBase::serve(process::Event const&)
> 27125.0ms 47.8% 0.0
> process::DispatchEvent::visit(process::EventVisitor*) const
> 27125.0ms 47.8% 0.0
> process::ProcessBase::visit(process::DispatchEvent const&)
> 27125.0ms 47.8% 0.0 std::__1::function<void
> (process::ProcessBase*)>::operator()(process::ProcessBase*) const
> 27124.0ms 47.8% 0.0
> std::__1::__function::__func<process::Future<mesos::ResourceStatistics>
> process::dispatch<mesos::ResourceStatistics,
> mesos::internal::slave::IsolatorProcess, mesos::ContainerID const&,
> mesos::ContainerID>(process::PID<mesos::internal::slave::IsolatorProcess>
> const&, process::Future<mesos::ResourceStatistics>
> (mesos::internal::slave::IsolatorProcess::*)(mesos::ContainerID const&),
> mesos::ContainerID)::'lambda'(process::ProcessBase*),
> std::__1::allocator<process::Future<mesos::ResourceStatistics>
> process::dispatch<mesos::ResourceStatistics,
> mesos::internal::slave::IsolatorProcess, mesos::ContainerID const&,
> mesos::ContainerID>(process::PID<mesos::internal::slave::IsolatorProcess>
> const&, process::Future<mesos::ResourceStatistics>
> (mesos::internal::slave::IsolatorProcess::*)(mesos::ContainerID const&),
> mesos::ContainerID)::'lambda'(process::ProcessBase*)>, void
> (process::ProcessBase*)>::operator()(process::ProcessBase*&&)
> 27124.0ms 47.8% 1.0
> process::Future<mesos::ResourceStatistics>
> process::dispatch<mesos::ResourceStatistics,
> mesos::internal::slave::IsolatorProcess, mesos::ContainerID const&,
> mesos::ContainerID>(process::PID<mesos::internal::slave::IsolatorProcess>
> const&, process::Future<mesos::ResourceStatistics>
> (mesos::internal::slave::IsolatorProcess::*)(mesos::ContainerID const&),
> mesos::ContainerID)::'lambda'(process::ProcessBase*)::operator()(process::ProcessBase*)
> const
> 27060.0ms 47.7% 1.0
> mesos::internal::slave::PosixCpuIsolatorProcess::usage(mesos::ContainerID
> const&)
> 27046.0ms 47.7% 2.0
> mesos::internal::usage(int, bool, bool)
> 27023.0ms 47.6% 2.0 os::pstree(Option<int>)
> 26748.0ms 47.1% 23.0 os::processes()
> 24809.0ms 43.7% 349.0 os::process(int)
> 8199.0ms 14.4% 47.0 os::sysctl::string()
> const
> 7562.0ms 13.3% 7562.0 __sysctl
> {code}
> We could see that usage() in usage/usage.cpp is causing this.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)