Repository: mesos Updated Branches: refs/heads/master 406d72ed0 -> 61e116dd6
Added slave metric to count container launch failures. Review: https://reviews.apache.org/r/35738 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/61e116dd Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/61e116dd Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/61e116dd Branch: refs/heads/master Commit: 61e116dd668c8a5bf2bba12fd908ff2e9c445d23 Parents: 406d72e Author: Paul Brett <[email protected]> Authored: Tue Jun 23 13:02:04 2015 -0700 Committer: Jie Yu <[email protected]> Committed: Tue Jun 23 14:31:02 2015 -0700 ---------------------------------------------------------------------- src/slave/metrics.cpp | 8 +++++- src/slave/metrics.hpp | 2 ++ src/slave/slave.cpp | 4 ++- src/tests/containerizer.cpp | 33 ++++++++++----------- src/tests/metrics_tests.cpp | 2 ++ src/tests/slave_tests.cpp | 62 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 93 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/slave/metrics.cpp ---------------------------------------------------------------------- diff --git a/src/slave/metrics.cpp b/src/slave/metrics.cpp index 87289f2..ae3a53e 100644 --- a/src/slave/metrics.cpp +++ b/src/slave/metrics.cpp @@ -84,7 +84,9 @@ Metrics::Metrics(const Slave& slave) "slave/invalid_framework_messages"), executor_directory_max_allowed_age_secs( "slave/executor_directory_max_allowed_age_secs", - defer(slave, &Slave::_executor_directory_max_allowed_age_secs)) + defer(slave, &Slave::_executor_directory_max_allowed_age_secs)), + container_launch_errors( + "slave/container_launch_errors") { // TODO(dhamon): Check return values for metric registration. process::metrics::add(uptime_secs); @@ -115,6 +117,8 @@ Metrics::Metrics(const Slave& slave) process::metrics::add(executor_directory_max_allowed_age_secs); + process::metrics::add(container_launch_errors); + // Create resource gauges. // TODO(dhamon): Set these up dynamically when creating a slave // based on the resources it exposes. @@ -197,6 +201,8 @@ Metrics::~Metrics() process::metrics::remove(executor_directory_max_allowed_age_secs); + process::metrics::remove(container_launch_errors); + foreach (const Gauge& gauge, resources_total) { process::metrics::remove(gauge); } http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/slave/metrics.hpp ---------------------------------------------------------------------- diff --git a/src/slave/metrics.hpp b/src/slave/metrics.hpp index 780c438..43c8662 100644 --- a/src/slave/metrics.hpp +++ b/src/slave/metrics.hpp @@ -65,6 +65,8 @@ struct Metrics process::metrics::Gauge executor_directory_max_allowed_age_secs; + process::metrics::Counter container_launch_errors; + // Non-revocable resources. std::vector<process::metrics::Gauge> resources_total; std::vector<process::metrics::Gauge> resources_used; http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/slave/slave.cpp ---------------------------------------------------------------------- diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 946e372..b3e1ccc 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -3197,8 +3197,9 @@ void Slave::executorLaunched( << "' failed to start: " << (future.isFailed() ? future.failure() : " future discarded"); - containerizer->destroy(containerId); + ++metrics.container_launch_errors; + containerizer->destroy(containerId); return; } else if (!future.get()) { LOG(ERROR) << "Container '" << containerId @@ -3208,6 +3209,7 @@ void Slave::executorLaunched( << flags.containerizers << ") could create a container for the " << "provided TaskInfo/ExecutorInfo message."; + ++metrics.container_launch_errors; return; } http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/tests/containerizer.cpp ---------------------------------------------------------------------- diff --git a/src/tests/containerizer.cpp b/src/tests/containerizer.cpp index 80b9105..68c6f98 100644 --- a/src/tests/containerizer.cpp +++ b/src/tests/containerizer.cpp @@ -182,7 +182,7 @@ void TestContainerizer::destroy( std::pair<FrameworkID, ExecutorID> key(frameworkId, executorId); if (!containers_.contains(key)) { LOG(WARNING) << "Ignoring destroy of unknown container for executor '" - << executorId << "' of framework '" << frameworkId << "'"; + << executorId << "' of framework " << frameworkId; return; } destroy(containers_[key]); @@ -191,21 +191,22 @@ void TestContainerizer::destroy( void TestContainerizer::destroy(const ContainerID& containerId) { - CHECK(drivers.contains(containerId)) - << "Failed to terminate container " << containerId - << " because it is has not been started"; - - Owned<MesosExecutorDriver> driver = drivers[containerId]; - driver->stop(); - driver->join(); - drivers.erase(containerId); - - containerizer::Termination termination; - termination.set_killed(false); - termination.set_message("Killed executor"); - termination.set_status(0); - promises[containerId]->set(termination); - promises.erase(containerId); + if (drivers.contains(containerId)) { + Owned<MesosExecutorDriver> driver = drivers[containerId]; + driver->stop(); + driver->join(); + drivers.erase(containerId); + } + + if (promises.contains(containerId)) { + containerizer::Termination termination; + termination.set_killed(false); + termination.set_message("Killed executor"); + termination.set_status(0); + + promises[containerId]->set(termination); + promises.erase(containerId); + } } http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/tests/metrics_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/metrics_tests.cpp b/src/tests/metrics_tests.cpp index 6896727..3e9d7c2 100644 --- a/src/tests/metrics_tests.cpp +++ b/src/tests/metrics_tests.cpp @@ -201,6 +201,8 @@ TEST_F(MetricsTest, Slave) EXPECT_EQ(1u, stats.values.count("slave/valid_framework_messages")); EXPECT_EQ(1u, stats.values.count("slave/invalid_framework_messages")); + EXPECT_EQ(1u, stats.values.count("slave/container_launch_errors")); + EXPECT_EQ(1u, stats.values.count("slave/cpus_total")); EXPECT_EQ(1u, stats.values.count("slave/cpus_used")); EXPECT_EQ(1u, stats.values.count("slave/cpus_percent")); http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/tests/slave_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp index 5030198..027da84 100644 --- a/src/tests/slave_tests.cpp +++ b/src/tests/slave_tests.cpp @@ -911,6 +911,8 @@ TEST_F(SlaveTest, MetricsInMetricsEndpoint) 1u, snapshot.values.count("slave/executor_directory_max_allowed_age_secs")); + EXPECT_EQ(1u, snapshot.values.count("slave/container_launch_errors")); + EXPECT_EQ(1u, snapshot.values.count("slave/cpus_total")); EXPECT_EQ(1u, snapshot.values.count("slave/cpus_used")); EXPECT_EQ(1u, snapshot.values.count("slave/cpus_percent")); @@ -927,6 +929,66 @@ TEST_F(SlaveTest, MetricsInMetricsEndpoint) } +// Test to verify that we increment the container launch errors metric +// when we fail to launch a container. +TEST_F(SlaveTest, MetricsSlaveLaunchErrors) +{ + // Start a master. + Try<PID<Master>> master = StartMaster(); + ASSERT_SOME(master); + + TestContainerizer containerizer; + + // Start a slave. + Try<PID<Slave>> slave = StartSlave(&containerizer); + ASSERT_SOME(slave); + + MockScheduler sched; + MesosSchedulerDriver driver( + &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL); + + EXPECT_CALL(sched, registered(_, _, _)); + + Future<vector<Offer>> offers; + EXPECT_CALL(sched, resourceOffers(&driver, _)) + .WillOnce(FutureArg<1>(&offers)) + .WillRepeatedly(Return()); // Ignore subsequent offers. + + driver.start(); + + AWAIT_READY(offers); + EXPECT_NE(0u, offers.get().size()); + const Offer offer = offers.get()[0]; + + // Verify that we start with no launch failures. + JSON::Object snapshot = Metrics(); + EXPECT_EQ(0, snapshot.values["slave/container_launch_errors"]); + + EXPECT_CALL(containerizer, launch(_, _, _, _, _, _, _)) + .WillOnce(Return(Failure("Injected failure"))); + + EXPECT_CALL(sched, statusUpdate(&driver, _)); + + // Try to start a task + TaskInfo task = createTask( + offer.slave_id(), + Resources::parse("cpus:1;mem:32").get(), + "sleep 1000", + DEFAULT_EXECUTOR_ID); + + driver.launchTasks(offer.id(), {task}); + + // After failure injection, metrics should report a single failure. + snapshot = Metrics(); + EXPECT_EQ(1, snapshot.values["slave/container_launch_errors"]); + + driver.stop(); + driver.join(); + + Shutdown(); +} + + TEST_F(SlaveTest, StateEndpoint) { Try<PID<Master>> master = StartMaster();
