Windows: Fixed recovery of Mesos containerizer.

The Windows OS deletes the job object created in the agent process when
the agent dies, because no other process holds a handle to it (despite
processes being assigned to the job object). While this is
counter-intuitive, it is the observed behavior. So in order for recovery
to succeed, the containerizer must also hold an otherwise unused handle
to its job object to keep it alive in the kernel, and available for
recovery to find.

Review: https://reviews.apache.org/r/65465


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/04990253
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/04990253
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/04990253

Branch: refs/heads/master
Commit: 04990253e6e7986904141e98e0b6e7008b0f0771
Parents: 0bc1c8c
Author: Andrew Schwartzmeyer <and...@schwartzmeyer.com>
Authored: Wed Jan 31 18:44:52 2018 -0800
Committer: Andrew Schwartzmeyer <and...@schwartzmeyer.com>
Committed: Fri Feb 9 13:05:05 2018 -0800

----------------------------------------------------------------------
 src/slave/containerizer/mesos/launch.cpp | 33 +++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/04990253/src/slave/containerizer/mesos/launch.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/launch.cpp b/src/slave/containerizer/mesos/launch.cpp
index cde60fd..75b7eaf 100644
--- a/src/slave/containerizer/mesos/launch.cpp
+++ b/src/slave/containerizer/mesos/launch.cpp
@@ -36,6 +36,8 @@
 #include <stout/os.hpp>
 #include <stout/protobuf.hpp>
 #include <stout/path.hpp>
+#include <stout/stringify.hpp>
+#include <stout/try.hpp>
 #include <stout/unreachable.hpp>
 
 #include <stout/os/int_fd.hpp>
@@ -44,6 +46,10 @@
 #include <stout/os/which.hpp>
 #include <stout/os/write.hpp>
 
+#ifdef __WINDOWS__
+#include <stout/windows/os.hpp>
+#endif // __WINDOWS__
+
 #include <mesos/mesos.hpp>
 #include <mesos/type_utils.hpp>
 
@@ -510,6 +516,29 @@ int MesosContainerizerLaunch::execute()
     cerr << "Failed to install signal handlers: " << signals.error() << endl;
     exitWithStatus(EXIT_FAILURE);
   }
+#else
+  // We need a handle to the job object which this container is associated with.
+  // Without this handle, the job object would be destroyed by the OS when the
+  // agent exits (or crashes), making recovery impossible. By holding a handle,
+  // we tie the lifetime of the job object to the container itself. In this way,
+  // a recovering agent can reattach to the container by opening a new handle to
+  // the job object.
+  const pid_t pid = ::GetCurrentProcessId();
+  const Try<std::wstring> name = os::name_job(pid);
+  if (name.isError()) {
+    cerr << "Failed to create job object name from pid: " << name.error()
+         << endl;
+    exitWithStatus(EXIT_FAILURE);
+  }
+
+  // NOTE: This handle will not be destructed, even though it is a
+  // `SharedHandle`, because it will (purposefully) never go out of scope.
+  Try<SharedHandle> handle = os::open_job(JOB_OBJECT_QUERY, false, name.get());
+  if (handle.isError()) {
+    cerr << "Failed to open job object '" << stringify(name.get())
+         << "' for the current container: " << handle.error() << endl;
+    exitWithStatus(EXIT_FAILURE);
+  }
 #endif // __WINDOWS__
 
   if (flags.launch_info.isNone()) {
@@ -1037,6 +1066,10 @@ int MesosContainerizerLaunch::execute()
   }
 #endif // __WINDOWS__
 
+  // NOTE: On Windows, these functions call `CreateProcess` and then wait for
+  // the new process to exit. Because of this, the `SharedHandle` to the job
+  // object does not go out of scope. This is unlike the POSIX behavior of
+  // `exec`, as the process image is intentionally not replaced.
   if (envp.isSome()) {
     os::execvpe(executable.c_str(), argv, envp.get());
   } else {

Reply via email to