Repository: mesos Updated Branches: refs/heads/master 30a56be17 -> 0a2957ed0
Make sure the mesos-fetcher exits if the slave terminates. Review: https://reviews.apache.org/r/24722 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f66fa52e Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f66fa52e Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f66fa52e Branch: refs/heads/master Commit: f66fa52e7efd9c10f9256805e45095591d4833a7 Parents: 30a56be Author: Benjamin Hindman <[email protected]> Authored: Thu Aug 14 17:22:47 2014 -0700 Committer: Benjamin Hindman <[email protected]> Committed: Fri Aug 15 16:59:08 2014 -0700 ---------------------------------------------------------------------- src/launcher/fetcher.cpp | 46 ++++++++++++++++++++ src/slave/containerizer/mesos/containerizer.cpp | 2 + 2 files changed, 48 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/f66fa52e/src/launcher/fetcher.cpp ---------------------------------------------------------------------- diff --git a/src/launcher/fetcher.cpp b/src/launcher/fetcher.cpp index 50e9918..1e3d516 100644 --- a/src/launcher/fetcher.cpp +++ b/src/launcher/fetcher.cpp @@ -20,6 +20,8 @@ #include <mesos/mesos.hpp> +#include <process/io.hpp> + #include <stout/net.hpp> #include <stout/option.hpp> #include <stout/os.hpp> @@ -191,10 +193,54 @@ Try<string> fetch( } +// A helper function for abnormally cancelling the fetching because +// our parent has died (e.g., the slave). +process::Future<Nothing> cancel() +{ + // We don't easily have a handle on any of the children we've + // potentially started since they're hidden behind os::system, + // net::download, HDFS, etc, so we just do a killtree on all of our + // children. + // + // TODO(benh): This still isn't sufficient because we might be in + // the middle of forking a process. 
What we really need to do is run + // os::killtree "outside" of this process so that we can pause this + // process too! + Try<os::ProcessTree> pstree = os::pstree(0); + + if (pstree.isSome() && !pstree.get().children.empty()) { + foreach (const os::ProcessTree& child, pstree.get().children) { + // NOTE: We don't follow groups or sessions because it's + // possible we'll end up killing ourselves, or worse, the slave! + os::killtree(child.process.pid, 9); + } + } + + EXIT(1) << "Cancelled fetching because stdin was closed " + << "(e.g., because the parent has exited)"; + + return Nothing(); +} + + int main(int argc, char* argv[]) { GOOGLE_PROTOBUF_VERIFY_VERSION; + // The current semantics of the mesos-fetcher is that it should + // terminate if/when its parent terminates. To support this, we read + // from stdin and if/when we get back an EOF then we "cancel" any + // fetching and exit so we don't become an orphan (which would be + // especially bad in the event calling something like HDFS ends up + // hung indefinitely). + // + // TODO(benh): Introduce a timeout for fetching each URI that can be + // set via flags on the slave. + // + // TODO(benh): Introduce a flag here for changing these semantics. + process::io::read(STDIN_FILENO) + .then(lambda::bind(&cancel)); + CommandInfo commandInfo; // Construct URIs from the encoded environment string. 
const std::string& uris = os::getenv("MESOS_EXECUTOR_URIS"); http://git-wip-us.apache.org/repos/asf/mesos/blob/f66fa52e/src/slave/containerizer/mesos/containerizer.cpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp index d0676c5..cdf440d 100644 --- a/src/slave/containerizer/mesos/containerizer.cpp +++ b/src/slave/containerizer/mesos/containerizer.cpp @@ -502,6 +502,8 @@ Future<Nothing> MesosContainerizerProcess::fetch( LOG(INFO) << "Fetching URIs for container '" << containerId << "' using command '" << command << "'"; + // NOTE: It's important that we create a pipe for the mesos-fetcher + // stdin so that when the slave exits it will terminate itself. Try<Subprocess> fetcher = subprocess( command, Subprocess::PIPE(),
