git commit: Exposed recovery errors that are encountered in non-strict recovery mode.

2013-08-26 Thread vinodkone
Updated Branches:
  refs/heads/master c014cfb2d -> 2b86e3906


Exposed recovery errors that are encountered in non-strict
recovery mode.

Review: https://reviews.apache.org/r/13789


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2b86e390
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2b86e390
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2b86e390

Branch: refs/heads/master
Commit: 2b86e3906854bd3afa352e4b623eecfc2145466b
Parents: c014cfb
Author: Vinod Kone vi...@twitter.com
Authored: Fri Aug 23 17:30:58 2013 -0700
Committer: Vinod Kone vi...@twitter.com
Committed: Mon Aug 26 12:11:59 2013 -0400

--
 src/slave/http.cpp  |  2 +-
 src/slave/slave.cpp |  8 +++-
 src/slave/slave.hpp |  3 +++
 src/slave/state.cpp | 34 --
 src/slave/state.hpp | 22 --
 5 files changed, 55 insertions(+), 14 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/http.cpp
--
diff --git a/src/slave/http.cpp b/src/slave/http.cpp
index 073d092..62fbb37 100644
--- a/src/slave/http.cpp
+++ b/src/slave/http.cpp
@@ -289,7 +289,7 @@ FutureResponse Slave::Http::stats(const Request request)
   object.values[valid_status_updates] = slave.stats.validStatusUpdates;
   object.values[invalid_status_updates] = slave.stats.invalidStatusUpdates;
   object.values[registered] = slave.master ? 1 : 0;
-
+  object.values[recovery_errors] = slave.recoveryErrors;
 
   return OK(object, request.query.get(jsonp));
 }

http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/slave.cpp
--
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index dd599a7..3e2c600 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -90,7 +90,8 @@ Slave::Slave(const slave::Flags _flags,
 files(_files),
 monitor(_isolator),
 statusUpdateManager(new StatusUpdateManager()),
-metaDir(paths::getMetaRootDir(flags.work_dir)) {}
+metaDir(paths::getMetaRootDir(flags.work_dir)),
+recoveryErrors(0) {}
 
 
 Slave::~Slave()
@@ -2666,6 +2667,11 @@ FutureNothing Slave::recover(bool reconnect, bool strict)
 
   info = state.get().info.get(); // Recover the slave info.
 
+  recoveryErrors = state.get().errors;
+  if (recoveryErrors  0) {
+LOG(WARNING)  Errors encountered during recovery:   recoveryErrors;
+  }
+
   // First, recover the frameworks and executors.
   foreachvalue (const FrameworkState frameworkState, state.get().frameworks) {
 recoverFramework(frameworkState);

http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/slave.hpp
--
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index aca7f03..ce2b0da 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -334,6 +334,9 @@ private:
 
   // Root meta directory containing checkpointed data.
   const std::string metaDir;
+
+  // Indicates the number of errors ignored in --no-strict recovery mode.
+  unsigned int recoveryErrors;
 };
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/state.cpp
--
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index b66aaa3..51bc8f4 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -86,6 +86,7 @@ TrySlaveState SlaveState::recover(
   return Error(message);
 } else {
   LOG(WARNING)  message;
+  state.errors++;
   return state;
 }
   }
@@ -115,6 +116,7 @@ TrySlaveState SlaveState::recover(
 }
 
 state.frameworks[frameworkId] = framework.get();
+state.errors += framework.get().errors;
   }
 
   return state;
@@ -148,13 +150,14 @@ TryFrameworkState FrameworkState::recover(
 message = Failed to read framework info from ' + path + ':  +
   (frameworkInfo.isError() ? frameworkInfo.error() :  none);
 
-  if (strict) {
-return Error(message);
-  } else {
-LOG(WARNING)  message;
-return state;
-  }
+if (strict) {
+  return Error(message);
+} else {
+  LOG(WARNING)  message;
+  state.errors++;
+  return state;
 }
+  }
 
   state.info = frameworkInfo.get();
 
@@ -177,6 +180,7 @@ TryFrameworkState FrameworkState::recover(
   return Error(message);
 } else {
   LOG(WARNING)  message;
+  state.errors++;
   return state;
 }
   }
@@ -207,6 +211,7 @@ TryFrameworkState FrameworkState::recover(
 }
 
 state.executors[executorId] = executor.get();
+state.errors += executor.get().errors;
   }
 
   return state;
@@ -246,6 +251,7 @@ TryExecutorState ExecutorState::recover(
   

[2/2] git commit: Added SlaveRecoveryTest.MultipleFrameworks test.

2013-08-26 Thread vinodkone
Added SlaveRecoveryTest.MultipleFrameworks test.

From: Jiang Yan Xu y...@jxu.me
Review: https://reviews.apache.org/r/13786


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/9a866c1a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/9a866c1a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/9a866c1a

Branch: refs/heads/master
Commit: 9a866c1a05be10d821df8b569f2af3722986745b
Parents: 9a9daa4
Author: Vinod Kone vi...@twitter.com
Authored: Mon Aug 26 12:18:20 2013 -0400
Committer: Vinod Kone vi...@twitter.com
Committed: Mon Aug 26 12:18:20 2013 -0400

--
 src/tests/slave_recovery_tests.cpp | 155 
 1 file changed, 155 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/mesos/blob/9a866c1a/src/tests/slave_recovery_tests.cpp
--
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index c2c3ce0..c0f130e 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -2105,3 +2105,158 @@ TYPED_TEST(SlaveRecoveryTest, MasterFailover)
 
   this-Shutdown(); // Shutdown before isolator(s) get deallocated.
 }
+
+
+// In this test there are two frameworks and one slave. Each
+// framework launches a task before the slave goes down. We verify
+// that the two frameworks and their tasks are recovered after the
+// slave restarts.
+TYPED_TEST(SlaveRecoveryTest, MultipleFrameworks)
+{
+  TryPIDMaster  master = this-StartMaster();
+  ASSERT_SOME(master);
+
+  TypeParam isolator1;
+
+  slave::Flags flags = this-CreateSlaveFlags();
+
+  TryPIDSlave  slave = this-StartSlave(isolator1, flags);
+  ASSERT_SOME(slave);
+
+  // Framework 1.
+  MockScheduler sched1;
+
+  // Enable checkpointing for the framework.
+  FrameworkInfo frameworkInfo1;
+  frameworkInfo1.CopyFrom(DEFAULT_FRAMEWORK_INFO);
+  frameworkInfo1.set_checkpoint(true);
+
+  MesosSchedulerDriver driver1(sched1, frameworkInfo1, master.get());
+
+  EXPECT_CALL(sched1, registered(_, _, _));
+
+  FuturevectorOffer  offers1;
+  EXPECT_CALL(sched1, resourceOffers(_, _))
+.WillOnce(FutureArg1(offers1));
+
+  driver1.start();
+
+  AWAIT_READY(offers1);
+  EXPECT_NE(0u, offers1.get().size());
+
+  // Use part of the resources in the offer so that the rest can be
+  // offered to framework 2.
+  Offer offer1 = offers1.get()[0];
+  offer1.mutable_resources()-CopyFrom(
+  Resources::parse(cpus:1;mem:512).get());
+
+  // Framework 1 launches a task.
+  TaskInfo task1 = createTask(offer1, sleep 1000);
+  vectorTaskInfo tasks1;
+  tasks1.push_back(task1); // Long-running task
+
+  EXPECT_CALL(sched1, statusUpdate(_, _));
+
+  FutureNothing _statusUpdateAcknowledgement1 =
+FUTURE_DISPATCH(_, Slave::_statusUpdateAcknowledgement);
+
+  driver1.launchTasks(offer1.id(), tasks1);
+
+  // Wait for the ACK to be checkpointed.
+  AWAIT_READY(_statusUpdateAcknowledgement1);
+
+  // Framework 2.
+  MockScheduler sched2;
+
+  // Enable checkpointing for the framework.
+  FrameworkInfo frameworkInfo2;
+  frameworkInfo2.CopyFrom(DEFAULT_FRAMEWORK_INFO);
+  frameworkInfo2.set_checkpoint(true);
+
+  MesosSchedulerDriver driver2(sched2, frameworkInfo2, master.get());
+
+  EXPECT_CALL(sched2, registered(_, _, _));
+
+  FuturevectorOffer  offers2;
+  EXPECT_CALL(sched2, resourceOffers(_, _))
+.WillOnce(FutureArg1(offers2));
+
+  driver2.start();
+
+  AWAIT_READY(offers2);
+  EXPECT_NE(0u, offers2.get().size());
+
+  // Framework 2 launches a task.
+  TaskInfo task2 = createTask(offers2.get()[0], sleep 1000);
+
+  vectorTaskInfo tasks2;
+  tasks2.push_back(task2); // Long-running task
+
+  EXPECT_CALL(sched2, statusUpdate(_, _));
+
+  FutureNothing _statusUpdateAcknowledgement2 =
+FUTURE_DISPATCH(_, Slave::_statusUpdateAcknowledgement);
+  driver2.launchTasks(offers2.get()[0].id(), tasks2);
+
+  // Wait for the ACK to be checkpointed.
+  AWAIT_READY(_statusUpdateAcknowledgement2);
+
+  this-Stop(slave.get());
+
+  FutureNothing _recover = FUTURE_DISPATCH(_, Slave::_recover);
+
+  FutureReregisterSlaveMessage reregisterSlaveMessage =
+FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _);
+
+  // Restart the slave (use same flags) with a new isolator.
+  TypeParam isolator2;
+
+  slave = this-StartSlave(isolator2, flags);
+  ASSERT_SOME(slave);
+
+  Clock::pause();
+
+  AWAIT_READY(_recover);
+
+  Clock::settle(); // Wait for slave to schedule reregister timeout.
+
+  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+
+  // Wait for the slave to re-register.
+  AWAIT_READY(reregisterSlaveMessage);
+
+  Clock::resume();
+
+  // Expectations for the status changes as a result of killing the
+  // tasks.
+  FutureTaskStatus status1;
+  EXPECT_CALL(sched1, statusUpdate(_, _))
+.WillOnce(FutureArg1(status1))
+  

[1/2] git commit: Added SlaveRecoveryTest.MasterFailover test.

2013-08-26 Thread vinodkone
Updated Branches:
  refs/heads/master 2b86e3906 -> 9a866c1a0


Added SlaveRecoveryTest.MasterFailover test.

From: Jiang Yan Xu y...@jxu.me
Review: https://reviews.apache.org/r/13747


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/9a9daa4c
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/9a9daa4c
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/9a9daa4c

Branch: refs/heads/master
Commit: 9a9daa4c7c0dd3031d84f20d22718c300a234bc2
Parents: 2b86e39
Author: Vinod Kone vi...@twitter.com
Authored: Mon Aug 26 12:17:27 2013 -0400
Committer: Vinod Kone vi...@twitter.com
Committed: Mon Aug 26 12:17:27 2013 -0400

--
 src/master/master.cpp  |   4 +
 src/tests/slave_recovery_tests.cpp | 135 
 2 files changed, 139 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/mesos/blob/9a9daa4c/src/master/master.cpp
--
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 511332e..dc155ba 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -979,6 +979,10 @@ void Master::killTask(const FrameworkID frameworkId,
   update-set_uuid(UUID::random().toBytes());
   send(framework-pid, message);
 }
+  } else {
+LOG(WARNING)  Failed to kill task   taskId
+   of framework   frameworkId
+   because the framework cannot be found;
   }
 }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a9daa4c/src/tests/slave_recovery_tests.cpp
--
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index 548e8c0..c2c3ce0 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -1970,3 +1970,138 @@ TYPED_TEST(SlaveRecoveryTest, PartitionedSlave)
 
   this-Shutdown(); // Shutdown before isolator(s) get deallocated.
 }
+
+
+// This test verifies that if the master changes when the slave is
+// down, the slave can still recover the task when it restarts. We
+// verify its correctness by killing the task from the scheduler.
+TYPED_TEST(SlaveRecoveryTest, MasterFailover)
+{
+  // Step 1. Run a task.
+  TryPIDMaster  master = this-StartMaster();
+  ASSERT_SOME(master);
+
+  TypeParam isolator1;
+
+  slave::Flags flags = this-CreateSlaveFlags();
+
+  TryPIDSlave  slave = this-StartSlave(isolator1, flags);
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+
+  // Enable checkpointing for the framework.
+  FrameworkInfo frameworkInfo;
+  frameworkInfo.CopyFrom(DEFAULT_FRAMEWORK_INFO);
+  frameworkInfo.set_checkpoint(true);
+
+  MesosSchedulerDriver driver(sched, frameworkInfo, master.get());
+
+  EXPECT_CALL(sched, registered(_, _, _));
+
+  FuturevectorOffer  offers1;
+  EXPECT_CALL(sched, resourceOffers(_, _))
+.WillOnce(FutureArg1(offers1));
+
+  Futureprocess::Message frameworkRegisteredMessage =
+FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);
+
+  driver.start();
+
+  AWAIT_READY(frameworkRegisteredMessage);
+  AWAIT_READY(offers1);
+  EXPECT_NE(0u, offers1.get().size());
+
+  TaskInfo task = createTask(offers1.get()[0], sleep 1000);
+  vectorTaskInfo tasks;
+  tasks.push_back(task); // Long-running task
+
+  EXPECT_CALL(sched, statusUpdate(_, _));
+
+  FutureNothing _statusUpdateAcknowledgement =
+FUTURE_DISPATCH(_, Slave::_statusUpdateAcknowledgement);
+
+  driver.launchTasks(offers1.get()[0].id(), tasks);
+
+  // Wait for the ACK to be checkpointed.
+  AWAIT_READY(_statusUpdateAcknowledgement);
+
+  this-Stop(slave.get());
+
+  // Step 2. Simulate failed over master by restarting the master.
+  this-Stop(master.get());
+  master = this-StartMaster();
+  ASSERT_SOME(master);
+
+  FutureNothing registered;
+  EXPECT_CALL(sched, registered(driver, _, _))
+.WillOnce(FutureSatisfy(registered));
+
+  // Simulate a new master detected message to the scheduler.
+  NewMasterDetectedMessage newMasterDetectedMsg;
+  newMasterDetectedMsg.set_pid(master.get());
+
+  process::post(frameworkRegisteredMessage.get().to, newMasterDetectedMsg);
+
+  // Framework should get a registered callback.
+  AWAIT_READY(registered);
+
+  // Step 3. Restart the slave and kill the task.
+  FutureNothing _recover = FUTURE_DISPATCH(_, Slave::_recover);
+
+  FutureReregisterSlaveMessage reregisterSlaveMessage =
+FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _);
+
+  // Restart the slave (use same flags) with a new isolator.
+  TypeParam isolator2;
+
+  slave = this-StartSlave(isolator2, flags);
+  ASSERT_SOME(slave);
+
+  Clock::pause();
+
+  AWAIT_READY(_recover);
+
+  Clock::settle(); // Wait for slave to schedule reregister timeout.
+
+  Clock::advance(EXECUTOR_REREGISTER_TIMEOUT);
+
+  // Wait for the 

git commit: Added a slave recovery test for the case of multiple slaves.

2013-08-26 Thread vinodkone
Updated Branches:
  refs/heads/master 9a866c1a0 -> f00832a1d


Added a slave recovery test for the case of multiple slaves.

From: Jie Yu yujie@gmail.com
Review: https://reviews.apache.org/r/13753


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f00832a1
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f00832a1
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f00832a1

Branch: refs/heads/master
Commit: f00832a1d8cb164cc1e2a12a89565dc36b10d048
Parents: 9a866c1
Author: Vinod Kone vi...@twitter.com
Authored: Mon Aug 26 15:38:19 2013 -0400
Committer: Vinod Kone vi...@twitter.com
Committed: Mon Aug 26 15:38:19 2013 -0400

--
 src/tests/slave_recovery_tests.cpp | 182 
 1 file changed, 182 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/mesos/blob/f00832a1/src/tests/slave_recovery_tests.cpp
--
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index c0f130e..0735dba 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -2260,3 +2260,185 @@ TYPED_TEST(SlaveRecoveryTest, MultipleFrameworks)
 
   this-Shutdown(); // Shutdown before isolator(s) get deallocated.
 }
+
+
+// Create a test fixture for those slave recovery tests that only work
+// when ProcessIsolator is used.
+// TODO(jieyu): We use typed test here because it magically allows us
+// to access protected members in Slave (e.g. Slave::_recover).
+template typename T
+class SlaveRecoveryProcessIsolatorTest : public IsolatorTestT
+{
+public:
+  virtual slave::Flags CreateSlaveFlags()
+  {
+slave::Flags flags = IsolatorTestT::CreateSlaveFlags();
+
+// Setup recovery slave flags.
+flags.checkpoint = true;
+flags.recover = reconnect;
+flags.strict = true;
+
+return flags;
+  }
+};
+
+
+TYPED_TEST_CASE(SlaveRecoveryProcessIsolatorTest,
+::testing::TypesProcessIsolator);
+
+
+// This test verifies that slave recovery works properly even if
+// multiple slaves are co-located on the same host.
+TYPED_TEST(SlaveRecoveryProcessIsolatorTest, MultipleSlaves)
+{
+  TryPIDMaster  master = this-StartMaster();
+  ASSERT_SOME(master);
+
+  // Enable checkpointing for the framework.
+  FrameworkInfo frameworkInfo;
+  frameworkInfo.CopyFrom(DEFAULT_FRAMEWORK_INFO);
+  frameworkInfo.set_checkpoint(true);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(sched, frameworkInfo, master.get());
+
+  EXPECT_CALL(sched, registered(_, _, _));
+
+  driver.start();
+
+  FuturevectorOffer  offers1;
+  EXPECT_CALL(sched, resourceOffers(driver, _))
+.WillOnce(FutureArg1(offers1));
+
+  // Start the first slave.
+  slave::Flags flags1 = this-CreateSlaveFlags();
+  slave::ProcessIsolator isolator1;
+
+  TryPIDSlave  slave1 = this-StartSlave(isolator1, flags1);
+  ASSERT_SOME(slave1);
+
+  AWAIT_READY(offers1);
+  ASSERT_EQ(1u, offers1.get().size());
+
+  // Launch a long running task in the first slave.
+  TaskInfo task1 = createTask(offers1.get()[0], sleep 1000);
+  vectorTaskInfo tasks1;
+  tasks1.push_back(task1);
+
+  EXPECT_CALL(sched, statusUpdate(_, _))
+.Times(1);
+
+  FutureNothing _statusUpdateAcknowledgement1 =
+FUTURE_DISPATCH(slave1.get(), Slave::_statusUpdateAcknowledgement);
+
+  driver.launchTasks(offers1.get()[0].id(), tasks1);
+
+  // Wait for the ACK to be checkpointed.
+  AWAIT_READY(_statusUpdateAcknowledgement1);
+
+  FuturevectorOffer  offers2;
+  EXPECT_CALL(sched, resourceOffers(driver, _))
+.WillOnce(FutureArg1(offers2));
+
+  // Start the second slave.
+  slave::Flags flags2 = this-CreateSlaveFlags();
+  slave::ProcessIsolator isolator2;
+
+  TryPIDSlave  slave2 = this-StartSlave(isolator2, flags2);
+  ASSERT_SOME(slave2);
+
+  AWAIT_READY(offers2);
+  ASSERT_EQ(1u, offers2.get().size());
+
+  // Launch a long running task in each slave.
+  TaskInfo task2 = createTask(offers2.get()[0], sleep 1000);
+  vectorTaskInfo tasks2;
+  tasks2.push_back(task2);
+
+  EXPECT_CALL(sched, statusUpdate(_, _))
+.Times(1);
+
+  FutureNothing _statusUpdateAcknowledgement2 =
+FUTURE_DISPATCH(slave2.get(), Slave::_statusUpdateAcknowledgement);
+
+  driver.launchTasks(offers2.get()[0].id(), tasks2);
+
+  // Wait for the ACKs to be checkpointed.
+  AWAIT_READY(_statusUpdateAcknowledgement2);
+
+  this-Stop(slave1.get());
+  this-Stop(slave2.get());
+
+  FutureNothing _recover1 = FUTURE_DISPATCH(_, Slave::_recover);
+  FutureNothing _recover2 = FUTURE_DISPATCH(_, Slave::_recover);
+
+  FutureReregisterSlaveMessage reregisterSlave1 =
+FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _);
+  FutureReregisterSlaveMessage reregisterSlave2 =
+FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _);
+
+  // Restart both slaves 

git commit: Added a recovery timeout for executor driver self-termination.

2013-08-26 Thread bmahler
Updated Branches:
  refs/heads/master f00832a1d -> 5eb50d6e8


Added a recovery timeout for executor driver self-termination.

Review: https://reviews.apache.org/r/13791


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/5eb50d6e
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/5eb50d6e
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/5eb50d6e

Branch: refs/heads/master
Commit: 5eb50d6e8da1f311de30ca5f1218bb5bcba2236c
Parents: f00832a
Author: Benjamin Mahler bmah...@twitter.com
Authored: Fri Aug 23 19:13:30 2013 -0700
Committer: Benjamin Mahler bmah...@twitter.com
Committed: Mon Aug 26 12:52:07 2013 -0700

--
 src/exec/exec.cpp  | 64 +--
 src/launcher/launcher.cpp  | 10 +++-
 src/launcher/launcher.hpp  |  7 ++-
 src/launcher/main.cpp  | 26 -
 src/slave/cgroups_isolator.cpp |  3 +-
 src/slave/constants.cpp|  1 +
 src/slave/constants.hpp|  1 +
 src/slave/flags.hpp|  9 
 src/slave/process_isolator.cpp |  3 +-
 src/tests/slave_recovery_tests.cpp | 93 +
 10 files changed, 206 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/mesos/blob/5eb50d6e/src/exec/exec.cpp
--
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index ca61892..d370560 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -106,7 +106,8 @@ public:
   const ExecutorID _executorId,
   bool _local,
   const string _directory,
-  bool _checkpoint)
+  bool _checkpoint,
+  Duration _recoveryTimeout)
 : ProcessBase(ID::generate(executor)),
   slave(_slave),
   driver(_driver),
@@ -115,10 +116,12 @@ public:
   frameworkId(_frameworkId),
   executorId(_executorId),
   connected(false),
+  connection(UUID::random()),
   local(_local),
   aborted(false),
   directory(_directory),
-  checkpoint(_checkpoint)
+  checkpoint(_checkpoint),
+  recoveryTimeout(_recoveryTimeout)
   {
 installExecutorRegisteredMessage(
 ExecutorProcess::registered,
@@ -195,6 +198,7 @@ protected:
 VLOG(1)  Executor registered on slave   slaveId;
 
 connected = true;
+connection = UUID::random();
 
 Stopwatch stopwatch;
 if (FLAGS_v = 1) {
@@ -216,6 +220,9 @@ protected:
 
 VLOG(1)  Executor re-registered on slave   slaveId;
 
+connected = true;
+connection = UUID::random();
+
 Stopwatch stopwatch;
 if (FLAGS_v = 1) {
   stopwatch.start();
@@ -391,6 +398,23 @@ protected:
 aborted = true;
   }
 
+  void _recoveryTimeout(UUID _connection)
+  {
+// If we're connected, no need to shut down the driver!
+if (connected) {
+  return;
+}
+
+// We need to compare the connections here to ensure there have
+// not been any subsequent re-registrations with the slave in the
+// interim.
+if (connection == _connection) {
+  VLOG(1)  Recovery timeout of   recoveryTimeout   exceeded; 
+   Shutting down;
+  shutdown();
+}
+  }
+
   virtual void exited(const UPID pid)
   {
 if (aborted) {
@@ -402,13 +426,21 @@ protected:
 // successfully registered with the slave, the slave can reconnect with
 // this executor when it comes back up and performs recovery!
 if (checkpoint  connected) {
+  connected = false;
+
   VLOG(1)  Slave exited, but framework has checkpointing enabled. 
-   Waiting to reconnect with slave   slaveId;
+   Waiting   recoveryTimeout   to reconnect with slave 
+   slaveId;
+
+  delay(recoveryTimeout, self(), Self::_recoveryTimeout, connection);
+
   return;
 }
 
 VLOG(1)  Slave exited ... shutting down;
 
+connected = false;
+
 if (!local) {
   // Start the Shutdown Process.
   spawn(new ShutdownProcess(), true);
@@ -494,10 +526,12 @@ private:
   FrameworkID frameworkId;
   ExecutorID executorId;
   bool connected; // Registered with the slave.
+  UUID connection; // UUID to identify the connection instance.
   bool local;
   bool aborted;
   const string directory;
   bool checkpoint;
+  Duration recoveryTimeout;
 
   LinkedHashMapUUID, StatusUpdate updates; // Unacknowledged updates.
 
@@ -587,7 +621,7 @@ Status MesosExecutorDriver::start()
   value = os::getenv(MESOS_SLAVE_PID);
   slave = UPID(value);
   if (!slave) {
-fatal(cannot parse MESOS_SLAVE_PID);
+fatal(Cannot parse MESOS_SLAVE_PID '%s', value.c_str());
   }
 
   // Get slave ID from environment.
@@ -615,6 +649,25 @@ Status MesosExecutorDriver::start()
 checkpoint = false;
   }
 
+  Duration 

Git Push Summary

2013-08-26 Thread bmahler
Updated Tags:  refs/tags/0.14.0-rc2 [created] 2346f6de4


[3/3] git commit: Upgraded ZooKeeper from 3.3.4 to 3.3.6.

2013-08-26 Thread bmahler
Upgraded ZooKeeper from 3.3.4 to 3.3.6.

From: Vinson Lee v...@twitter.com
Review: https://reviews.apache.org/r/13598


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/eb1cd4a7
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/eb1cd4a7
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/eb1cd4a7

Branch: refs/heads/master
Commit: eb1cd4a7c0ad4310f090d4f0643cf4059ac5246b
Parents: 56a54f5
Author: Benjamin Mahler bmah...@twitter.com
Authored: Mon Aug 26 18:26:07 2013 -0700
Committer: Benjamin Mahler bmah...@twitter.com
Committed: Mon Aug 26 18:26:55 2013 -0700

--
 3rdparty/versions.am|   2 +-
 3rdparty/zookeeper-3.3.4.tar.gz | Bin 13543276 -> 0 bytes
 3rdparty/zookeeper-3.3.6.tar.gz | Bin 0 -> 11833706 bytes
 src/python/setup.py.in  |   2 +-
 4 files changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/mesos/blob/eb1cd4a7/3rdparty/versions.am
--
diff --git a/3rdparty/versions.am b/3rdparty/versions.am
index 5932e1f..5e0b9f8 100644
--- a/3rdparty/versions.am
+++ b/3rdparty/versions.am
@@ -21,4 +21,4 @@
 
 BOTO_VERSION = 2.0b2
 DISTRIBUTE_VERSION = 0.6.26
-ZOOKEEPER_VERSION = 3.3.4
+ZOOKEEPER_VERSION = 3.3.6

http://git-wip-us.apache.org/repos/asf/mesos/blob/eb1cd4a7/3rdparty/zookeeper-3.3.4.tar.gz
--
diff --git a/3rdparty/zookeeper-3.3.4.tar.gz b/3rdparty/zookeeper-3.3.4.tar.gz
deleted file mode 100644
index 09d4924..0000000
Binary files a/3rdparty/zookeeper-3.3.4.tar.gz and /dev/null differ

http://git-wip-us.apache.org/repos/asf/mesos/blob/eb1cd4a7/3rdparty/zookeeper-3.3.6.tar.gz
--
diff --git a/3rdparty/zookeeper-3.3.6.tar.gz b/3rdparty/zookeeper-3.3.6.tar.gz
new file mode 100644
index 0000000..5588107
Binary files /dev/null and b/3rdparty/zookeeper-3.3.6.tar.gz differ

http://git-wip-us.apache.org/repos/asf/mesos/blob/eb1cd4a7/src/python/setup.py.in
--
diff --git a/src/python/setup.py.in b/src/python/setup.py.in
index 77fa880..8e6ec55 100644
--- a/src/python/setup.py.in
+++ b/src/python/setup.py.in
@@ -20,7 +20,7 @@ src_python_dist = os.path.join('src', 'python', 'dist')
 src_python_native = os.path.join('src', 'python', 'native')
 
 leveldb = os.path.join('3rdparty', 'leveldb')
-zookeeper = os.path.join('3rdparty', 'zookeeper-3.3.4', 'src', 'c')
+zookeeper = os.path.join('3rdparty', 'zookeeper-3.3.6', 'src', 'c')
 libprocess = os.path.join('3rdparty', 'libprocess')
 
 # Even though a statically compiled libprocess should include glog,



[2/3] git commit: Fixed a 32 bit compilation issue in files.cpp.

2013-08-26 Thread bmahler
Fixed a 32 bit compilation issue in files.cpp.

The call to min(ssize_t, long int) won't compile on 32 bit systems
because ssize_t resolves to int on those systems.

From: Kevin Lyda ke...@ie.suberic.net
Review: https://reviews.apache.org/r/13793


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/56a54f53
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/56a54f53
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/56a54f53

Branch: refs/heads/master
Commit: 56a54f536b2d6b2b2bd80fdfd665d1cd5980a9c1
Parents: 6f24656
Author: Benjamin Mahler bmah...@twitter.com
Authored: Mon Aug 26 18:20:27 2013 -0700
Committer: Benjamin Mahler bmah...@twitter.com
Committed: Mon Aug 26 18:20:27 2013 -0700

--
 src/files/files.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/mesos/blob/56a54f53/src/files/files.cpp
--
diff --git a/src/files/files.cpp b/src/files/files.cpp
index c609505..69d6852 100644
--- a/src/files/files.cpp
+++ b/src/files/files.cpp
@@ -272,7 +272,7 @@ FutureResponse FilesProcess::read(const Request request)
   }
 
   // Cap the read length at 16 pages.
-  length = std::min(length, sysconf(_SC_PAGE_SIZE) * 16);
+  length = std::minssize_t(length, sysconf(_SC_PAGE_SIZE) * 16);
 
   if (offset = size) {
 os::close(fd.get());