Improved failure handling of DockerContainerizer.

Two improvements:

(1) Checked the exit status of 'docker.run()' in order to fail early.

(2) Improved the override on mesos-executor to return the exit status
of the Docker container rather than the exit status of doing 'docker
wait' (which will be 0 even if the container exited with -1 because
'docker wait' correctly "waited" and it prints the container's exit
status to stdout).


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2f2cf5b8
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2f2cf5b8
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2f2cf5b8

Branch: refs/heads/master
Commit: 2f2cf5b84ec2fb273345a4f2949bd0a714d50f56
Parents: c530a5b
Author: Benjamin Hindman <[email protected]>
Authored: Wed Jul 9 13:13:32 2014 -0700
Committer: Benjamin Hindman <[email protected]>
Committed: Mon Aug 4 15:08:17 2014 -0700

----------------------------------------------------------------------
 src/slave/containerizer/docker.cpp | 52 ++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/2f2cf5b8/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp 
b/src/slave/containerizer/docker.cpp
index f7cc630..2aceb35 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -28,6 +28,8 @@
 #include <stout/hashset.hpp>
 #include <stout/os.hpp>
 
+#include "common/status_utils.hpp"
+
 #include "docker/docker.hpp"
 
 #ifdef __linux__
@@ -129,7 +131,8 @@ private:
       const std::string& directory,
       const SlaveID& slaveId,
       const PID<Slave>& slavePid,
-      bool checkpoint);
+      bool checkpoint,
+      const Option<int>& status);
 
   void _destroy(
       const ContainerID& containerId,
@@ -398,10 +401,10 @@ Future<Nothing> DockerContainerizerProcess::recover(
         CHECK_SOME(run.get().forkedPid);
         pid_t pid = run.get().forkedPid.get();
 
-        Future<Option<int > > status = process::reap(pid);
+        statuses[containerId] = process::reap(pid);
 
-        statuses[containerId] = status;
-        status.onAny(defer(self(), &Self::reaped, containerId));
+        statuses[containerId]
+          .onAny(defer(self(), &Self::reaped, containerId));
 
         if (pids.containsValue(pid)) {
           // This should (almost) never occur. There is the
@@ -531,7 +534,8 @@ Future<bool> DockerContainerizerProcess::launch(
                 directory,
                 slaveId,
                 slavePid,
-                checkpoint))
+                checkpoint,
+                lambda::_1))
     .onFailed(defer(self(), &Self::destroy, containerId, false));
 }
 
@@ -543,8 +547,17 @@ Future<bool> DockerContainerizerProcess::_launch(
     const string& directory,
     const SlaveID& slaveId,
     const PID<Slave>& slavePid,
-    bool checkpoint)
+    bool checkpoint,
+    const Option<int>& status)
 {
+  // Try and see if the run failed.
+  if (status.isSome() && status.get() != 0) {
+    // Best effort kill and remove the container just in case.
+    docker.killAndRm(DOCKER_NAME_PREFIX + stringify(containerId));
+    return Failure("Failed to run the container (" +
+                   WSTRINGIFY(status.get()) + ")");
+  }
+
   // Prepare environment variables for the executor.
   map<string, string> env = executorEnvironment(
       executorInfo,
@@ -562,10 +575,13 @@ Future<bool> DockerContainerizerProcess::_launch(
 
   // Construct the mesos-executor "override" to do a 'docker wait'
   // using the "name" we gave the container (to distinguish it from
-  // Docker containers not created by Mesos).
-  // TODO(benh): Get full path to 'docker'.
+  // Docker containers not created by Mesos). Note, however, that we
+  // don't want the exit status from 'docker wait' but rather the exit
+  // status from the container, hence the use of /bin/bash.
   string override =
-    flags.docker + " wait " + DOCKER_NAME_PREFIX + stringify(containerId);
+    "/bin/bash -c 'exit `" +
+    flags.docker + " wait " + DOCKER_NAME_PREFIX + stringify(containerId) +
+    "`'";
 
   Try<Subprocess> s = subprocess(
       executorInfo.command().value() + " --override " + override,
@@ -621,10 +637,10 @@ Future<bool> DockerContainerizerProcess::_launch(
   }
 
   // And finally watch for when the executor gets reaped.
-  Future<Option<int> > status = process::reap(s.get().pid());
+  statuses[containerId] = process::reap(s.get().pid());
 
-  statuses.put(containerId, status);
-  status.onAny(defer(self(), &Self::reaped, containerId));
+  statuses[containerId]
+    .onAny(defer(self(), &Self::reaped, containerId));
 
   return true;
 }
@@ -915,7 +931,17 @@ void DockerContainerizerProcess::_destroy(
     return;
   }
 
-  statuses.get(containerId).get()
+  // It's possible we've been asked to destroy a container that we
+  // aren't actually reaping any status because we failed to start the
+  // container in the first place (e.g., because we returned a Failure
+  // in 'launch' or '_launch'). In this case, we just put a None
+  // status in place so that the rest of the destroy workflow
+  // completes.
+  if (!statuses.contains(containerId)) {
+    statuses[containerId] = None();
+  }
+
+  statuses[containerId]
     .onAny(defer(self(), &Self::__destroy, containerId, killed, lambda::_1));
 }
 

Reply via email to