Repository: mesos Updated Branches: refs/heads/master 7bf1e8a6b -> a57b2eb52
Changed executor state recovery to allow run recovery in absence of executor info. This patch lets executor recovery recover runs in the absence of executor info. This is needed because the new task-info patch will introduce an intermediate state where the executor info hasn't been checkpointed. In this interim, the slave may fail over and should be in a position to clean up orphan containers (as of now, the containerizer API doesn't provide a way to reconcile the executor info, and it is therefore not possible to recover the containers in this case). Review: https://reviews.apache.org/r/20221 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a57b2eb5 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a57b2eb5 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a57b2eb5 Branch: refs/heads/master Commit: a57b2eb523b0f603d60461cbad5599ebd5e776aa Parents: 7bf1e8a Author: Niklas Nielsen <[email protected]> Authored: Thu Apr 17 11:29:32 2014 -0700 Committer: Niklas Q. 
Nielsen <[email protected]> Committed: Thu Apr 17 11:29:32 2014 -0700 ---------------------------------------------------------------------- src/slave/slave.cpp | 5 ++-- src/slave/state.cpp | 70 ++++++++++++++++++++++++------------------------ 2 files changed, 38 insertions(+), 37 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/a57b2eb5/src/slave/slave.cpp ---------------------------------------------------------------------- diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 19c5f0d..d6ec87c 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -3070,10 +3070,11 @@ void Framework::recoverExecutor(const ExecutorState& state) CHECK_NOTNULL(slave); - if (state.runs.empty() || state.latest.isNone()) { + if (state.runs.empty() || state.latest.isNone() || state.info.isNone()) { LOG(WARNING) << "Skipping recovery of executor '" << state.id << "' of framework " << id - << " because its latest run cannot be recovered"; + << " because its latest run or executor info" + << " cannot be recovered"; // GC the top level executor work directory. slave->garbageCollect(paths::getExecutorPath( http://git-wip-us.apache.org/repos/asf/mesos/blob/a57b2eb5/src/slave/state.cpp ---------------------------------------------------------------------- diff --git a/src/slave/state.cpp b/src/slave/state.cpp index a2af33c..2889245 100644 --- a/src/slave/state.cpp +++ b/src/slave/state.cpp @@ -274,41 +274,6 @@ Try<ExecutorState> ExecutorState::recover( state.id = executorId; string message; - // Read the executor info. - const string& path = - paths::getExecutorInfoPath(rootDir, slaveId, frameworkId, executorId); - if (!os::exists(path)) { - // This could happen if the slave died after creating the executor - // directory but before it checkpointed the executor info. 
- LOG(WARNING) << "Failed to find executor info file '" << path << "'"; - return state; - } - - const Result<ExecutorInfo>& executorInfo = - ::protobuf::read<ExecutorInfo>(path); - - if (executorInfo.isError()) { - message = "Failed to read executor info from '" + path + "': " + - executorInfo.error(); - - if (strict) { - return Error(message); - } else { - LOG(WARNING) << message; - state.errors++; - return state; - } - } - - if (executorInfo.isNone()) { - // This could happen if the slave died after opening the file for - // writing but before it checkpointed anything. - LOG(WARNING) << "Found empty executor info file '" << path << "'"; - return state; - } - - state.info = executorInfo.get(); - // Find the runs. Try<list<string> > runs = os::glob(strings::format( paths::EXECUTOR_RUN_PATH, @@ -368,6 +333,41 @@ Try<ExecutorState> ExecutorState::recover( return state; } + // Read the executor info. + const string& path = + paths::getExecutorInfoPath(rootDir, slaveId, frameworkId, executorId); + if (!os::exists(path)) { + // This could happen if the slave died after creating the executor + // directory but before it checkpointed the executor info. + LOG(WARNING) << "Failed to find executor info file '" << path << "'"; + return state; + } + + const Result<ExecutorInfo>& executorInfo = + ::protobuf::read<ExecutorInfo>(path); + + if (executorInfo.isError()) { + message = "Failed to read executor info from '" + path + "': " + + executorInfo.error(); + + if (strict) { + return Error(message); + } else { + LOG(WARNING) << message; + state.errors++; + return state; + } + } + + if (executorInfo.isNone()) { + // This could happen if the slave died after opening the file for + // writing but before it checkpointed anything. + LOG(WARNING) << "Found empty executor info file '" << path << "'"; + return state; + } + + state.info = executorInfo.get(); + return state; }
