git commit: Exposed recovery errors that are encountered in non-strict recovery mode.
Updated Branches: refs/heads/master c014cfb2d - 2b86e3906 Exposed recovery errors that are encountered in non-strict recovery mode. Review: https://reviews.apache.org/r/13789 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2b86e390 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2b86e390 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2b86e390 Branch: refs/heads/master Commit: 2b86e3906854bd3afa352e4b623eecfc2145466b Parents: c014cfb Author: Vinod Kone vi...@twitter.com Authored: Fri Aug 23 17:30:58 2013 -0700 Committer: Vinod Kone vi...@twitter.com Committed: Mon Aug 26 12:11:59 2013 -0400 -- src/slave/http.cpp | 2 +- src/slave/slave.cpp | 8 +++- src/slave/slave.hpp | 3 +++ src/slave/state.cpp | 34 -- src/slave/state.hpp | 22 -- 5 files changed, 55 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/http.cpp -- diff --git a/src/slave/http.cpp b/src/slave/http.cpp index 073d092..62fbb37 100644 --- a/src/slave/http.cpp +++ b/src/slave/http.cpp @@ -289,7 +289,7 @@ FutureResponse Slave::Http::stats(const Request request) object.values[valid_status_updates] = slave.stats.validStatusUpdates; object.values[invalid_status_updates] = slave.stats.invalidStatusUpdates; object.values[registered] = slave.master ? 
1 : 0; - + object.values[recovery_errors] = slave.recoveryErrors; return OK(object, request.query.get(jsonp)); } http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/slave.cpp -- diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index dd599a7..3e2c600 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -90,7 +90,8 @@ Slave::Slave(const slave::Flags _flags, files(_files), monitor(_isolator), statusUpdateManager(new StatusUpdateManager()), -metaDir(paths::getMetaRootDir(flags.work_dir)) {} +metaDir(paths::getMetaRootDir(flags.work_dir)), +recoveryErrors(0) {} Slave::~Slave() @@ -2666,6 +2667,11 @@ FutureNothing Slave::recover(bool reconnect, bool strict) info = state.get().info.get(); // Recover the slave info. + recoveryErrors = state.get().errors; + if (recoveryErrors 0) { +LOG(WARNING) Errors encountered during recovery: recoveryErrors; + } + // First, recover the frameworks and executors. foreachvalue (const FrameworkState frameworkState, state.get().frameworks) { recoverFramework(frameworkState); http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/slave.hpp -- diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp index aca7f03..ce2b0da 100644 --- a/src/slave/slave.hpp +++ b/src/slave/slave.hpp @@ -334,6 +334,9 @@ private: // Root meta directory containing checkpointed data. const std::string metaDir; + + // Indicates the number of errors ignored in --no-strict recovery mode. 
+ unsigned int recoveryErrors; }; http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/state.cpp -- diff --git a/src/slave/state.cpp b/src/slave/state.cpp index b66aaa3..51bc8f4 100644 --- a/src/slave/state.cpp +++ b/src/slave/state.cpp @@ -86,6 +86,7 @@ TrySlaveState SlaveState::recover( return Error(message); } else { LOG(WARNING) message; + state.errors++; return state; } } @@ -115,6 +116,7 @@ TrySlaveState SlaveState::recover( } state.frameworks[frameworkId] = framework.get(); +state.errors += framework.get().errors; } return state; @@ -148,13 +150,14 @@ TryFrameworkState FrameworkState::recover( message = Failed to read framework info from ' + path + ': + (frameworkInfo.isError() ? frameworkInfo.error() : none); - if (strict) { -return Error(message); - } else { -LOG(WARNING) message; -return state; - } +if (strict) { + return Error(message); +} else { + LOG(WARNING) message; + state.errors++; + return state; } + } state.info = frameworkInfo.get(); @@ -177,6 +180,7 @@ TryFrameworkState FrameworkState::recover( return Error(message); } else { LOG(WARNING) message; + state.errors++; return state; } } @@ -207,6 +211,7 @@ TryFrameworkState FrameworkState::recover( } state.executors[executorId] = executor.get(); +state.errors += executor.get().errors; } return state; @@ -246,6 +251,7 @@ TryExecutorState ExecutorState::recover(
[2/2] git commit: Added SlaveRecoveryTest.MultipleFrameworks test.
Added SlaveRecoveryTest.MultipleFrameworks test. From: Jiang Yan Xu y...@jxu.me Review: https://reviews.apache.org/r/13786 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/9a866c1a Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/9a866c1a Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/9a866c1a Branch: refs/heads/master Commit: 9a866c1a05be10d821df8b569f2af3722986745b Parents: 9a9daa4 Author: Vinod Kone vi...@twitter.com Authored: Mon Aug 26 12:18:20 2013 -0400 Committer: Vinod Kone vi...@twitter.com Committed: Mon Aug 26 12:18:20 2013 -0400 -- src/tests/slave_recovery_tests.cpp | 155 1 file changed, 155 insertions(+) -- http://git-wip-us.apache.org/repos/asf/mesos/blob/9a866c1a/src/tests/slave_recovery_tests.cpp -- diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp index c2c3ce0..c0f130e 100644 --- a/src/tests/slave_recovery_tests.cpp +++ b/src/tests/slave_recovery_tests.cpp @@ -2105,3 +2105,158 @@ TYPED_TEST(SlaveRecoveryTest, MasterFailover) this-Shutdown(); // Shutdown before isolator(s) get deallocated. } + + +// In this test there are two frameworks and one slave. Each +// framework launches a task before the slave goes down. We verify +// that the two frameworks and their tasks are recovered after the +// slave restarts. +TYPED_TEST(SlaveRecoveryTest, MultipleFrameworks) +{ + TryPIDMaster master = this-StartMaster(); + ASSERT_SOME(master); + + TypeParam isolator1; + + slave::Flags flags = this-CreateSlaveFlags(); + + TryPIDSlave slave = this-StartSlave(isolator1, flags); + ASSERT_SOME(slave); + + // Framework 1. + MockScheduler sched1; + + // Enable checkpointing for the framework. 
+ FrameworkInfo frameworkInfo1; + frameworkInfo1.CopyFrom(DEFAULT_FRAMEWORK_INFO); + frameworkInfo1.set_checkpoint(true); + + MesosSchedulerDriver driver1(sched1, frameworkInfo1, master.get()); + + EXPECT_CALL(sched1, registered(_, _, _)); + + FuturevectorOffer offers1; + EXPECT_CALL(sched1, resourceOffers(_, _)) +.WillOnce(FutureArg1(offers1)); + + driver1.start(); + + AWAIT_READY(offers1); + EXPECT_NE(0u, offers1.get().size()); + + // Use part of the resources in the offer so that the rest can be + // offered to framework 2. + Offer offer1 = offers1.get()[0]; + offer1.mutable_resources()-CopyFrom( + Resources::parse(cpus:1;mem:512).get()); + + // Framework 1 launches a task. + TaskInfo task1 = createTask(offer1, sleep 1000); + vectorTaskInfo tasks1; + tasks1.push_back(task1); // Long-running task + + EXPECT_CALL(sched1, statusUpdate(_, _)); + + FutureNothing _statusUpdateAcknowledgement1 = +FUTURE_DISPATCH(_, Slave::_statusUpdateAcknowledgement); + + driver1.launchTasks(offer1.id(), tasks1); + + // Wait for the ACK to be checkpointed. + AWAIT_READY(_statusUpdateAcknowledgement1); + + // Framework 2. + MockScheduler sched2; + + // Enable checkpointing for the framework. + FrameworkInfo frameworkInfo2; + frameworkInfo2.CopyFrom(DEFAULT_FRAMEWORK_INFO); + frameworkInfo2.set_checkpoint(true); + + MesosSchedulerDriver driver2(sched2, frameworkInfo2, master.get()); + + EXPECT_CALL(sched2, registered(_, _, _)); + + FuturevectorOffer offers2; + EXPECT_CALL(sched2, resourceOffers(_, _)) +.WillOnce(FutureArg1(offers2)); + + driver2.start(); + + AWAIT_READY(offers2); + EXPECT_NE(0u, offers2.get().size()); + + // Framework 2 launches a task. 
+ TaskInfo task2 = createTask(offers2.get()[0], sleep 1000); + + vectorTaskInfo tasks2; + tasks2.push_back(task2); // Long-running task + + EXPECT_CALL(sched2, statusUpdate(_, _)); + + FutureNothing _statusUpdateAcknowledgement2 = +FUTURE_DISPATCH(_, Slave::_statusUpdateAcknowledgement); + driver2.launchTasks(offers2.get()[0].id(), tasks2); + + // Wait for the ACK to be checkpointed. + AWAIT_READY(_statusUpdateAcknowledgement2); + + this-Stop(slave.get()); + + FutureNothing _recover = FUTURE_DISPATCH(_, Slave::_recover); + + FutureReregisterSlaveMessage reregisterSlaveMessage = +FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _); + + // Restart the slave (use same flags) with a new isolator. + TypeParam isolator2; + + slave = this-StartSlave(isolator2, flags); + ASSERT_SOME(slave); + + Clock::pause(); + + AWAIT_READY(_recover); + + Clock::settle(); // Wait for slave to schedule reregister timeout. + + Clock::advance(EXECUTOR_REREGISTER_TIMEOUT); + + // Wait for the slave to re-register. + AWAIT_READY(reregisterSlaveMessage); + + Clock::resume(); + + // Expectations for the status changes as a result of killing the + // tasks. + FutureTaskStatus status1; + EXPECT_CALL(sched1, statusUpdate(_, _)) +.WillOnce(FutureArg1(status1)) +
[1/2] git commit: Added SlaveRecoveryTest.MasterFailover test.
Updated Branches: refs/heads/master 2b86e3906 - 9a866c1a0 Added SlaveRecoveryTest.MasterFailover test. From: Jiang Yan Xu y...@jxu.me Review: https://reviews.apache.org/r/13747 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/9a9daa4c Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/9a9daa4c Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/9a9daa4c Branch: refs/heads/master Commit: 9a9daa4c7c0dd3031d84f20d22718c300a234bc2 Parents: 2b86e39 Author: Vinod Kone vi...@twitter.com Authored: Mon Aug 26 12:17:27 2013 -0400 Committer: Vinod Kone vi...@twitter.com Committed: Mon Aug 26 12:17:27 2013 -0400 -- src/master/master.cpp | 4 + src/tests/slave_recovery_tests.cpp | 135 2 files changed, 139 insertions(+) -- http://git-wip-us.apache.org/repos/asf/mesos/blob/9a9daa4c/src/master/master.cpp -- diff --git a/src/master/master.cpp b/src/master/master.cpp index 511332e..dc155ba 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -979,6 +979,10 @@ void Master::killTask(const FrameworkID frameworkId, update-set_uuid(UUID::random().toBytes()); send(framework-pid, message); } + } else { +LOG(WARNING) Failed to kill task taskId + of framework frameworkId + because the framework cannot be found; } } http://git-wip-us.apache.org/repos/asf/mesos/blob/9a9daa4c/src/tests/slave_recovery_tests.cpp -- diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp index 548e8c0..c2c3ce0 100644 --- a/src/tests/slave_recovery_tests.cpp +++ b/src/tests/slave_recovery_tests.cpp @@ -1970,3 +1970,138 @@ TYPED_TEST(SlaveRecoveryTest, PartitionedSlave) this-Shutdown(); // Shutdown before isolator(s) get deallocated. } + + +// This test verifies that if the master changes when the slave is +// down, the slave can still recover the task when it restarts. We +// verify its correctness by killing the task from the scheduler. 
+TYPED_TEST(SlaveRecoveryTest, MasterFailover) +{ + // Step 1. Run a task. + TryPIDMaster master = this-StartMaster(); + ASSERT_SOME(master); + + TypeParam isolator1; + + slave::Flags flags = this-CreateSlaveFlags(); + + TryPIDSlave slave = this-StartSlave(isolator1, flags); + ASSERT_SOME(slave); + + MockScheduler sched; + + // Enable checkpointing for the framework. + FrameworkInfo frameworkInfo; + frameworkInfo.CopyFrom(DEFAULT_FRAMEWORK_INFO); + frameworkInfo.set_checkpoint(true); + + MesosSchedulerDriver driver(sched, frameworkInfo, master.get()); + + EXPECT_CALL(sched, registered(_, _, _)); + + FuturevectorOffer offers1; + EXPECT_CALL(sched, resourceOffers(_, _)) +.WillOnce(FutureArg1(offers1)); + + Futureprocess::Message frameworkRegisteredMessage = +FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _); + + driver.start(); + + AWAIT_READY(frameworkRegisteredMessage); + AWAIT_READY(offers1); + EXPECT_NE(0u, offers1.get().size()); + + TaskInfo task = createTask(offers1.get()[0], sleep 1000); + vectorTaskInfo tasks; + tasks.push_back(task); // Long-running task + + EXPECT_CALL(sched, statusUpdate(_, _)); + + FutureNothing _statusUpdateAcknowledgement = +FUTURE_DISPATCH(_, Slave::_statusUpdateAcknowledgement); + + driver.launchTasks(offers1.get()[0].id(), tasks); + + // Wait for the ACK to be checkpointed. + AWAIT_READY(_statusUpdateAcknowledgement); + + this-Stop(slave.get()); + + // Step 2. Simulate failed over master by restarting the master. + this-Stop(master.get()); + master = this-StartMaster(); + ASSERT_SOME(master); + + FutureNothing registered; + EXPECT_CALL(sched, registered(driver, _, _)) +.WillOnce(FutureSatisfy(registered)); + + // Simulate a new master detected message to the scheduler. + NewMasterDetectedMessage newMasterDetectedMsg; + newMasterDetectedMsg.set_pid(master.get()); + + process::post(frameworkRegisteredMessage.get().to, newMasterDetectedMsg); + + // Framework should get a registered callback. 
+ AWAIT_READY(registered); + + // Step 3. Restart the slave and kill the task. + FutureNothing _recover = FUTURE_DISPATCH(_, Slave::_recover); + + FutureReregisterSlaveMessage reregisterSlaveMessage = +FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _); + + // Restart the slave (use same flags) with a new isolator. + TypeParam isolator2; + + slave = this-StartSlave(isolator2, flags); + ASSERT_SOME(slave); + + Clock::pause(); + + AWAIT_READY(_recover); + + Clock::settle(); // Wait for slave to schedule reregister timeout. + + Clock::advance(EXECUTOR_REREGISTER_TIMEOUT); + + // Wait for the
git commit: Added a slave recovery test for the case of multiple slaves.
Updated Branches: refs/heads/master 9a866c1a0 - f00832a1d Added a slave recovery test for the case of multiple slaves. From: Jie Yu yujie@gmail.com Review: https://reviews.apache.org/r/13753 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f00832a1 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f00832a1 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f00832a1 Branch: refs/heads/master Commit: f00832a1d8cb164cc1e2a12a89565dc36b10d048 Parents: 9a866c1 Author: Vinod Kone vi...@twitter.com Authored: Mon Aug 26 15:38:19 2013 -0400 Committer: Vinod Kone vi...@twitter.com Committed: Mon Aug 26 15:38:19 2013 -0400 -- src/tests/slave_recovery_tests.cpp | 182 1 file changed, 182 insertions(+) -- http://git-wip-us.apache.org/repos/asf/mesos/blob/f00832a1/src/tests/slave_recovery_tests.cpp -- diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp index c0f130e..0735dba 100644 --- a/src/tests/slave_recovery_tests.cpp +++ b/src/tests/slave_recovery_tests.cpp @@ -2260,3 +2260,185 @@ TYPED_TEST(SlaveRecoveryTest, MultipleFrameworks) this-Shutdown(); // Shutdown before isolator(s) get deallocated. } + + +// Create a test fixture for those slave recovery tests that only work +// when ProcessIsolator is used. +// TODO(jieyu): We use typed test here because it magically allows us +// to access protected members in Slave (e.g. Slave::_recover). +template typename T +class SlaveRecoveryProcessIsolatorTest : public IsolatorTestT +{ +public: + virtual slave::Flags CreateSlaveFlags() + { +slave::Flags flags = IsolatorTestT::CreateSlaveFlags(); + +// Setup recovery slave flags. 
+flags.checkpoint = true; +flags.recover = reconnect; +flags.strict = true; + +return flags; + } +}; + + +TYPED_TEST_CASE(SlaveRecoveryProcessIsolatorTest, +::testing::TypesProcessIsolator); + + +// This test verifies that slave recovery works properly even if +// multiple slaves are co-located on the same host. +TYPED_TEST(SlaveRecoveryProcessIsolatorTest, MultipleSlaves) +{ + TryPIDMaster master = this-StartMaster(); + ASSERT_SOME(master); + + // Enable checkpointing for the framework. + FrameworkInfo frameworkInfo; + frameworkInfo.CopyFrom(DEFAULT_FRAMEWORK_INFO); + frameworkInfo.set_checkpoint(true); + + MockScheduler sched; + MesosSchedulerDriver driver(sched, frameworkInfo, master.get()); + + EXPECT_CALL(sched, registered(_, _, _)); + + driver.start(); + + FuturevectorOffer offers1; + EXPECT_CALL(sched, resourceOffers(driver, _)) +.WillOnce(FutureArg1(offers1)); + + // Start the first slave. + slave::Flags flags1 = this-CreateSlaveFlags(); + slave::ProcessIsolator isolator1; + + TryPIDSlave slave1 = this-StartSlave(isolator1, flags1); + ASSERT_SOME(slave1); + + AWAIT_READY(offers1); + ASSERT_EQ(1u, offers1.get().size()); + + // Launch a long running task in the first slave. + TaskInfo task1 = createTask(offers1.get()[0], sleep 1000); + vectorTaskInfo tasks1; + tasks1.push_back(task1); + + EXPECT_CALL(sched, statusUpdate(_, _)) +.Times(1); + + FutureNothing _statusUpdateAcknowledgement1 = +FUTURE_DISPATCH(slave1.get(), Slave::_statusUpdateAcknowledgement); + + driver.launchTasks(offers1.get()[0].id(), tasks1); + + // Wait for the ACK to be checkpointed. + AWAIT_READY(_statusUpdateAcknowledgement1); + + FuturevectorOffer offers2; + EXPECT_CALL(sched, resourceOffers(driver, _)) +.WillOnce(FutureArg1(offers2)); + + // Start the second slave. 
+ slave::Flags flags2 = this-CreateSlaveFlags(); + slave::ProcessIsolator isolator2; + + TryPIDSlave slave2 = this-StartSlave(isolator2, flags2); + ASSERT_SOME(slave2); + + AWAIT_READY(offers2); + ASSERT_EQ(1u, offers2.get().size()); + + // Launch a long running task in each slave. + TaskInfo task2 = createTask(offers2.get()[0], sleep 1000); + vectorTaskInfo tasks2; + tasks2.push_back(task2); + + EXPECT_CALL(sched, statusUpdate(_, _)) +.Times(1); + + FutureNothing _statusUpdateAcknowledgement2 = +FUTURE_DISPATCH(slave2.get(), Slave::_statusUpdateAcknowledgement); + + driver.launchTasks(offers2.get()[0].id(), tasks2); + + // Wait for the ACKs to be checkpointed. + AWAIT_READY(_statusUpdateAcknowledgement2); + + this-Stop(slave1.get()); + this-Stop(slave2.get()); + + FutureNothing _recover1 = FUTURE_DISPATCH(_, Slave::_recover); + FutureNothing _recover2 = FUTURE_DISPATCH(_, Slave::_recover); + + FutureReregisterSlaveMessage reregisterSlave1 = +FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _); + FutureReregisterSlaveMessage reregisterSlave2 = +FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _); + + // Restart both slaves
git commit: Added a recovery timeout for executor driver self-termination.
Updated Branches: refs/heads/master f00832a1d - 5eb50d6e8 Added a recovery timeout for executor driver self-termination. Review: https://reviews.apache.org/r/13791 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/5eb50d6e Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/5eb50d6e Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/5eb50d6e Branch: refs/heads/master Commit: 5eb50d6e8da1f311de30ca5f1218bb5bcba2236c Parents: f00832a Author: Benjamin Mahler bmah...@twitter.com Authored: Fri Aug 23 19:13:30 2013 -0700 Committer: Benjamin Mahler bmah...@twitter.com Committed: Mon Aug 26 12:52:07 2013 -0700 -- src/exec/exec.cpp | 64 +-- src/launcher/launcher.cpp | 10 +++- src/launcher/launcher.hpp | 7 ++- src/launcher/main.cpp | 26 - src/slave/cgroups_isolator.cpp | 3 +- src/slave/constants.cpp| 1 + src/slave/constants.hpp| 1 + src/slave/flags.hpp| 9 src/slave/process_isolator.cpp | 3 +- src/tests/slave_recovery_tests.cpp | 93 + 10 files changed, 206 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/mesos/blob/5eb50d6e/src/exec/exec.cpp -- diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index ca61892..d370560 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -106,7 +106,8 @@ public: const ExecutorID _executorId, bool _local, const string _directory, - bool _checkpoint) + bool _checkpoint, + Duration _recoveryTimeout) : ProcessBase(ID::generate(executor)), slave(_slave), driver(_driver), @@ -115,10 +116,12 @@ public: frameworkId(_frameworkId), executorId(_executorId), connected(false), + connection(UUID::random()), local(_local), aborted(false), directory(_directory), - checkpoint(_checkpoint) + checkpoint(_checkpoint), + recoveryTimeout(_recoveryTimeout) { installExecutorRegisteredMessage( ExecutorProcess::registered, @@ -195,6 +198,7 @@ protected: VLOG(1) Executor registered on slave slaveId; connected = true; +connection = UUID::random(); Stopwatch 
stopwatch; if (FLAGS_v = 1) { @@ -216,6 +220,9 @@ protected: VLOG(1) Executor re-registered on slave slaveId; +connected = true; +connection = UUID::random(); + Stopwatch stopwatch; if (FLAGS_v = 1) { stopwatch.start(); @@ -391,6 +398,23 @@ protected: aborted = true; } + void _recoveryTimeout(UUID _connection) + { +// If we're connected, no need to shut down the driver! +if (connected) { + return; +} + +// We need to compare the connections here to ensure there have +// not been any subsequent re-registrations with the slave in the +// interim. +if (connection == _connection) { + VLOG(1) Recovery timeout of recoveryTimeout exceeded; + Shutting down; + shutdown(); +} + } + virtual void exited(const UPID pid) { if (aborted) { @@ -402,13 +426,21 @@ protected: // successfully registered with the slave, the slave can reconnect with // this executor when it comes back up and performs recovery! if (checkpoint connected) { + connected = false; + VLOG(1) Slave exited, but framework has checkpointing enabled. - Waiting to reconnect with slave slaveId; + Waiting recoveryTimeout to reconnect with slave + slaveId; + + delay(recoveryTimeout, self(), Self::_recoveryTimeout, connection); + return; } VLOG(1) Slave exited ... shutting down; +connected = false; + if (!local) { // Start the Shutdown Process. spawn(new ShutdownProcess(), true); @@ -494,10 +526,12 @@ private: FrameworkID frameworkId; ExecutorID executorId; bool connected; // Registered with the slave. + UUID connection; // UUID to identify the connection instance. bool local; bool aborted; const string directory; bool checkpoint; + Duration recoveryTimeout; LinkedHashMapUUID, StatusUpdate updates; // Unacknowledged updates. @@ -587,7 +621,7 @@ Status MesosExecutorDriver::start() value = os::getenv(MESOS_SLAVE_PID); slave = UPID(value); if (!slave) { -fatal(cannot parse MESOS_SLAVE_PID); +fatal(Cannot parse MESOS_SLAVE_PID '%s', value.c_str()); } // Get slave ID from environment. 
@@ -615,6 +649,25 @@ Status MesosExecutorDriver::start() checkpoint = false; } + Duration
Git Push Summary
Updated Tags: refs/tags/0.14.0-rc2 [created] 2346f6de4
[3/3] git commit: Upgraded ZooKeeper from 3.3.4 to 3.3.6.
Upgraded ZooKeeper from 3.3.4 to 3.3.6. From: Vinson Lee v...@twitter.com Review: https://reviews.apache.org/r/13598 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/eb1cd4a7 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/eb1cd4a7 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/eb1cd4a7 Branch: refs/heads/master Commit: eb1cd4a7c0ad4310f090d4f0643cf4059ac5246b Parents: 56a54f5 Author: Benjamin Mahler bmah...@twitter.com Authored: Mon Aug 26 18:26:07 2013 -0700 Committer: Benjamin Mahler bmah...@twitter.com Committed: Mon Aug 26 18:26:55 2013 -0700 -- 3rdparty/versions.am | 2 +- 3rdparty/zookeeper-3.3.4.tar.gz | Bin 13543276 -> 0 bytes 3rdparty/zookeeper-3.3.6.tar.gz | Bin 0 -> 11833706 bytes src/python/setup.py.in | 2 +- 4 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/mesos/blob/eb1cd4a7/3rdparty/versions.am -- diff --git a/3rdparty/versions.am b/3rdparty/versions.am index 5932e1f..5e0b9f8 100644 --- a/3rdparty/versions.am +++ b/3rdparty/versions.am @@ -21,4 +21,4 @@ BOTO_VERSION = 2.0b2 DISTRIBUTE_VERSION = 0.6.26 -ZOOKEEPER_VERSION = 3.3.4 +ZOOKEEPER_VERSION = 3.3.6 http://git-wip-us.apache.org/repos/asf/mesos/blob/eb1cd4a7/3rdparty/zookeeper-3.3.4.tar.gz -- diff --git a/3rdparty/zookeeper-3.3.4.tar.gz b/3rdparty/zookeeper-3.3.4.tar.gz deleted file mode 100644 index 09d4924..0000000 Binary files a/3rdparty/zookeeper-3.3.4.tar.gz and /dev/null differ http://git-wip-us.apache.org/repos/asf/mesos/blob/eb1cd4a7/3rdparty/zookeeper-3.3.6.tar.gz -- diff --git a/3rdparty/zookeeper-3.3.6.tar.gz b/3rdparty/zookeeper-3.3.6.tar.gz new file mode 100644 index 0000000..5588107 Binary files /dev/null and b/3rdparty/zookeeper-3.3.6.tar.gz differ http://git-wip-us.apache.org/repos/asf/mesos/blob/eb1cd4a7/src/python/setup.py.in -- diff --git a/src/python/setup.py.in b/src/python/setup.py.in index 77fa880..8e6ec55 100644 --- a/src/python/setup.py.in +++
b/src/python/setup.py.in @@ -20,7 +20,7 @@ src_python_dist = os.path.join('src', 'python', 'dist') src_python_native = os.path.join('src', 'python', 'native') leveldb = os.path.join('3rdparty', 'leveldb') -zookeeper = os.path.join('3rdparty', 'zookeeper-3.3.4', 'src', 'c') +zookeeper = os.path.join('3rdparty', 'zookeeper-3.3.6', 'src', 'c') libprocess = os.path.join('3rdparty', 'libprocess') # Even though a statically compiled libprocess should include glog,
[2/3] git commit: Fixed a 32 bit compilation issue in files.cpp.
Fixed a 32 bit compilation issue in files.cpp. The call to min(ssize_t, long int) won't compile on 32 bit systems because ssize_t resolves to int on those systems. From: Kevin Lyda ke...@ie.suberic.net Review: https://reviews.apache.org/r/13793 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/56a54f53 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/56a54f53 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/56a54f53 Branch: refs/heads/master Commit: 56a54f536b2d6b2b2bd80fdfd665d1cd5980a9c1 Parents: 6f24656 Author: Benjamin Mahler bmah...@twitter.com Authored: Mon Aug 26 18:20:27 2013 -0700 Committer: Benjamin Mahler bmah...@twitter.com Committed: Mon Aug 26 18:20:27 2013 -0700 -- src/files/files.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/mesos/blob/56a54f53/src/files/files.cpp -- diff --git a/src/files/files.cpp b/src/files/files.cpp index c609505..69d6852 100644 --- a/src/files/files.cpp +++ b/src/files/files.cpp @@ -272,7 +272,7 @@ Future<Response> FilesProcess::read(const Request& request) } // Cap the read length at 16 pages. - length = std::min(length, sysconf(_SC_PAGE_SIZE) * 16); + length = std::min<ssize_t>(length, sysconf(_SC_PAGE_SIZE) * 16); if (offset >= size) { os::close(fd.get());