Repository: mesos
Updated Branches:
  refs/heads/master 1c2ee5c97 -> f9cb6f496


Refactored and simplified the pid namespace isolator.

The bind mounts in the pid namespace isolator turns out to be
unnecessary as the linux launcher will use freezer to kill all tasks
anyway. It makes the isolator unnecessarily complex, and has a mount
leak bug (MESOS-6145). This patch removes all the unnecessary bind
mounts, making the isolator extremely simple.

Review: https://reviews.apache.org/r/51963


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f9cb6f49
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f9cb6f49
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f9cb6f49

Branch: refs/heads/master
Commit: f9cb6f4963b08285289b246e228c9b180345bbcd
Parents: 1c2ee5c
Author: Jie Yu <yujie....@gmail.com>
Authored: Fri Sep 16 11:36:27 2016 -0700
Committer: Jie Yu <yujie....@gmail.com>
Committed: Fri Sep 16 15:57:58 2016 -0700

----------------------------------------------------------------------
 .../mesos/isolators/namespaces/pid.cpp          | 199 +++----------------
 .../mesos/isolators/namespaces/pid.hpp          |  37 +---
 .../containerizer/mesos/linux_launcher.cpp      |  18 --
 src/tests/containerizer/isolator_tests.cpp      |   2 +-
 src/tests/slave_recovery_tests.cpp              |   4 +-
 5 files changed, 33 insertions(+), 227 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f9cb6f49/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp 
b/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp
index b41e266..e330bb6 100644
--- a/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp
+++ b/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp
@@ -14,64 +14,25 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <sys/mount.h>
+#include <process/id.hpp>
 
-#include <list>
-#include <set>
-#include <string>
+#include <stout/strings.hpp>
 
-#include <stout/os.hpp>
-
-#include <stout/os/exists.hpp>
-#include <stout/os/ls.hpp>
-#include <stout/os/stat.hpp>
-
-#include "linux/fs.hpp"
 #include "linux/ns.hpp"
 
 #include "slave/containerizer/mesos/isolators/namespaces/pid.hpp"
 
-using namespace process;
-
-using std::list;
-using std::set;
-using std::string;
+using process::Future;
+using process::Owned;
 
 using mesos::slave::ContainerConfig;
 using mesos::slave::ContainerLaunchInfo;
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerState;
 using mesos::slave::Isolator;
 
 namespace mesos {
 namespace internal {
 namespace slave {
 
-// The root directory where we bind mount all the namespace handles.
-static const char PID_NS_BIND_MOUNT_ROOT[] = "/var/run/mesos/pidns";
-
-
-// The empty directory that we'll use to mask the namespace handles
-// inside each container. This mount ensures they cannot determine the
-// namespace of another container.
-static const char PID_NS_BIND_MOUNT_MASK_DIR[] = "/var/empty/mesos";
-
-
-// Helper to construct the path to a pid's namespace file.
-inline string nsProcFile(pid_t pid)
-{
-  return path::join("/proc", stringify(pid), "ns", "pid");
-}
-
-
-// Helper to construct the path to the additional reference created
-// for a container's pid namespace.
-inline string nsExtraReference(const ContainerID& containerId)
-{
-  return path::join(PID_NS_BIND_MOUNT_ROOT, stringify(containerId));
-}
-
-
 Try<Isolator*> NamespacesPidIsolatorProcess::create(const Flags& flags)
 {
   // Check for root permission.
@@ -84,22 +45,18 @@ Try<Isolator*> NamespacesPidIsolatorProcess::create(const 
Flags& flags)
     return Error("Pid namespaces are not supported by this kernel");
   }
 
-  // Create the directory where bind mounts of the pid namespace will
-  // be placed.
-  Try<Nothing> mkdir = os::mkdir(PID_NS_BIND_MOUNT_ROOT);
-  if (mkdir.isError()) {
-    return Error(
-        "Failed to create the bind mount root directory at " +
-        string(PID_NS_BIND_MOUNT_ROOT) + ": " + mkdir.error());
+  // Make sure 'linux' launcher is used because only 'linux' launcher
+  // supports cloning pid namespace for the container.
+  if (flags.launcher != "linux") {
+    return Error("'linux' launcher must be used to enable pid namespace");
   }
 
-  // Create the empty directory that will be used to mask the bind
-  // mounts inside each container.
-  mkdir = os::mkdir(PID_NS_BIND_MOUNT_MASK_DIR);
-  if (mkdir.isError()) {
-    return Error(
-        "Failed to create the bind mount mask direcrory at " +
-        string(PID_NS_BIND_MOUNT_MASK_DIR) + ": " + mkdir.error());
+  // Make sure 'filesystem/linux' isolator is used.
+  // NOTE: 'filesystem/linux' isolator will make sure mounts in the
+  // child mount namespace will not be propagated back to the host
+  // mount namespace.
+  if (!strings::contains(flags.isolation, "filesystem/linux")) {
+    return Error("'filesystem/linux' must be used to enable pid namespace");
   }
 
   return new MesosIsolator(Owned<MesosIsolatorProcess>(
@@ -107,49 +64,8 @@ Try<Isolator*> NamespacesPidIsolatorProcess::create(const 
Flags& flags)
 }
 
 
-Result<ino_t> NamespacesPidIsolatorProcess::getNamespace(
-    const ContainerID& containerId)
-{
-  const string target = nsExtraReference(containerId);
-
-  if (os::exists(target)) {
-    return os::stat::inode(target);
-  }
-
-  return None();
-}
-
-
-Future<Nothing> NamespacesPidIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  hashset<ContainerID> recovered;
-  foreach (const ContainerState& state, states) {
-    recovered.insert(state.container_id());
-  }
-
-  // Clean up any unknown orphaned bind mounts and empty files. Known
-  // orphan bind mounts and empty files will be destroyed by the
-  // containerizer using the normal cleanup path. See MESOS-2367 for
-  // details.
-  Try<list<string>> entries = os::ls(PID_NS_BIND_MOUNT_ROOT);
-  if (entries.isError()) {
-    return Failure("Failed to list existing containers in '" +
-                   string(PID_NS_BIND_MOUNT_ROOT) + "': " + entries.error());
-  }
-
-  foreach (const string& entry, entries.get()) {
-    ContainerID containerId;
-    containerId.set_value(entry);
-
-    if (!recovered.contains(containerId) && !orphans.contains(containerId)) {
-      cleanup(containerId);
-    }
-  }
-
-  return Nothing();
-}
+NamespacesPidIsolatorProcess::NamespacesPidIsolatorProcess()
+  : ProcessBase(process::ID::generate("pid-namespace-isolator")) {}
 
 
 Future<Option<ContainerLaunchInfo>> NamespacesPidIsolatorProcess::prepare(
@@ -159,83 +75,24 @@ Future<Option<ContainerLaunchInfo>> 
NamespacesPidIsolatorProcess::prepare(
   ContainerLaunchInfo launchInfo;
   launchInfo.set_namespaces(CLONE_NEWPID | CLONE_NEWNS);
 
-  // Mask the bind mount root directory in each container so
-  // containers cannot see the namespace bind mount of other
-  // containers.
-  launchInfo.add_pre_exec_commands()->set_value(
-      "mount -n --bind " + string(PID_NS_BIND_MOUNT_MASK_DIR) +
-      " " + string(PID_NS_BIND_MOUNT_ROOT));
-
-  // Mount /proc for the container's pid namespace to show the
-  // container's pids (and other /proc files), not the parent's. We
-  // first recursively make the mount private because /proc is usually
-  // marked explicitly as shared (see /proc/self/mountinfo) and
-  // changes would propagate to the parent's /proc mount otherwise. We
-  // then mount /proc with the standard options. This technique was
-  // taken from unshare.c in utils-linux for --mount-proc. We use the
-  // -n flag so the mount is not added to the mtab where it will not
-  // be correctly removed with the namespace terminates.
-  launchInfo.add_pre_exec_commands()->set_value(
-      "mount none /proc --make-private -o rec");
+  // Mount /proc with standard options for the container's pid
+  // namespace to show the container's pids (and other /proc files),
+  // not the parent's. This technique was taken from unshare.c in
+  // utils-linux for --mount-proc. We use the -n flag so the mount is
+  // not added to the mtab where it will not be correctly removed when
+  // the namespace terminates.
+  //
+  // NOTE: 'filesystem/linux' isolator will make sure mounts in the
+  // child mount namespace will not be propagated back to the host
+  // mount namespace.
+  //
+  // TOOD(jieyu): Consider unmount the existing /proc.
   launchInfo.add_pre_exec_commands()->set_value(
       "mount -n -t proc proc /proc -o nosuid,noexec,nodev");
 
   return launchInfo;
 }
 
-
-Future<Nothing> NamespacesPidIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  const string source = nsProcFile(pid);
-  const string target = nsExtraReference(containerId);
-
-  // Create a bind mount of the pid namespace so we can control the
-  // lifetime of the pid namespace. This lets us identify the
-  // container's pid namespace, even if the leading pid has exited.
-  Try<Nothing> touch = os::touch(target);
-  if (touch.isError()) {
-    return Failure("Failed to create bind mount target: " + touch.error());
-  }
-
-  Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, nullptr);
-  if (mount.isError()) {
-    return Failure(
-        "Failed to mount pid namespace handle from " +
-        source + " to " + target + ": " + mount.error());
-  }
-
-  return Nothing();
-}
-
-
-// An old glibc might not have this symbol.
-#ifndef MNT_DETACH
-#define MNT_DETACH 2
-#endif
-
-
-Future<Nothing> NamespacesPidIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  const string target = nsExtraReference(containerId);
-
-  if (os::exists(target)) {
-    // We don't expect anyone to have a reference to target but do a
-    // lazy umount in case. We do not want to force the umount; it
-    // will not cause an issue if this umount is delayed.
-    Try<Nothing> unmount = fs::unmount(target, MNT_DETACH);
-
-    // This will fail if the unmount hasn't completed yet but this
-    // only leaks a uniquely named empty file that will cleaned up as
-    // an orphan on recovery.
-    os::rm(target);
-  }
-
-  return Nothing();
-}
-
 } // namespace slave {
 } // namespace internal {
 } // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/f9cb6f49/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp 
b/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp
index 1c74ba2..ccb525a 100644
--- a/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp
+++ b/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp
@@ -17,14 +17,6 @@
 #ifndef __NAMESPACES_PID_ISOLATOR_HPP__
 #define __NAMESPACES_PID_ISOLATOR_HPP__
 
-#include <sys/types.h>
-
-#include <string>
-
-#include <process/id.hpp>
-
-#include <stout/result.hpp>
-
 #include "slave/flags.hpp"
 
 #include "slave/containerizer/mesos/isolator.hpp"
@@ -33,44 +25,19 @@ namespace mesos {
 namespace internal {
 namespace slave {
 
-// This isolator itself does not specify the necessary clone() flags
-// (see the LinuxLauncher for that) but it is used to keep track of a
-// container's pid namespace through a bind mount and exposed by
-// getNamespace().
 class NamespacesPidIsolatorProcess : public MesosIsolatorProcess
 {
 public:
   static Try<mesos::slave::Isolator*> create(const Flags& flags);
 
-  // Return the pid namespace of the container. Returns None if the
-  // container was not created in a separate pid namespace, i.e.,
-  // processes are in the same namespace as the slave. This is used by
-  // the LinuxLauncher to determine if it can kill the leading process
-  // in the container and let the kernel kill the remaining processes.
-  // A container may not have a pid namespace if it was created
-  // without the namespaces/pid isolator and the slave was
-  // subsequently restarted with namespaces/pid enabled.
-  static Result<ino_t> getNamespace(const ContainerID& container);
-
-  NamespacesPidIsolatorProcess()
-    : ProcessBase(process::ID::generate("mesos-pid-isolator")) {}
-
   virtual ~NamespacesPidIsolatorProcess() {}
 
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
   virtual process::Future<Option<mesos::slave::ContainerLaunchInfo>> prepare(
       const ContainerID& containerId,
       const mesos::slave::ContainerConfig& containerConfig);
 
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
+private:
+  NamespacesPidIsolatorProcess();
 };
 
 } // namespace slave {

http://git-wip-us.apache.org/repos/asf/mesos/blob/f9cb6f49/src/slave/containerizer/mesos/linux_launcher.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/linux_launcher.cpp 
b/src/slave/containerizer/mesos/linux_launcher.cpp
index d0f9285..1c49f56 100644
--- a/src/slave/containerizer/mesos/linux_launcher.cpp
+++ b/src/slave/containerizer/mesos/linux_launcher.cpp
@@ -38,8 +38,6 @@
 
 #include "slave/containerizer/mesos/linux_launcher.hpp"
 
-#include "slave/containerizer/mesos/isolators/namespaces/pid.hpp"
-
 using namespace process;
 
 using std::list;
@@ -345,22 +343,6 @@ Future<Nothing> LinuxLauncher::destroy(const ContainerID& 
containerId)
     return Nothing();
   }
 
-  Result<ino_t> containerPidNs =
-    NamespacesPidIsolatorProcess::getNamespace(containerId);
-
-  if (containerPidNs.isSome()) {
-    LOG(INFO) << "Using pid namespace to destroy container " << containerId;
-
-    return ns::pid::destroy(containerPidNs.get())
-      .then(lambda::bind(
-            (Future<Nothing>(*)(const string&,
-                                const string&,
-                                const Duration&))(&cgroups::destroy),
-            freezerHierarchy,
-            cgroup(containerId),
-            cgroups::DESTROY_TIMEOUT));
-  }
-
   // Try to clean up using just the freezer cgroup.
   return cgroups::destroy(
       freezerHierarchy,

http://git-wip-us.apache.org/repos/asf/mesos/blob/f9cb6f49/src/tests/containerizer/isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/isolator_tests.cpp 
b/src/tests/containerizer/isolator_tests.cpp
index dbe6147..93ce751 100644
--- a/src/tests/containerizer/isolator_tests.cpp
+++ b/src/tests/containerizer/isolator_tests.cpp
@@ -309,7 +309,7 @@ class NamespacesPidIsolatorTest : public MesosTest {};
 TEST_F(NamespacesPidIsolatorTest, ROOT_PidNamespace)
 {
   slave::Flags flags = CreateSlaveFlags();
-  flags.isolation = "namespaces/pid";
+  flags.isolation = "filesystem/linux,namespaces/pid";
 
   string directory = os::getcwd(); // We're inside a temporary sandbox.
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/f9cb6f49/src/tests/slave_recovery_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_recovery_tests.cpp 
b/src/tests/slave_recovery_tests.cpp
index 389daf0..23cf9d8 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -4098,7 +4098,7 @@ TEST_F(MesosContainerizerSlaveRecoveryTest, 
CGROUPS_ROOT_PidNamespaceForward)
   slave.get()->terminate();
 
   // Start a slave using a containerizer with pid namespace isolation.
-  flags.isolation = "cgroups/cpu,cgroups/mem,namespaces/pid";
+  flags.isolation = "cgroups/cpu,cgroups/mem,filesystem/linux,namespaces/pid";
 
   _containerizer = MesosContainerizer::create(flags, true, &fetcher);
   ASSERT_SOME(_containerizer);
@@ -4137,7 +4137,7 @@ TEST_F(MesosContainerizerSlaveRecoveryTest, 
CGROUPS_ROOT_PidNamespaceBackward)
 
   // Start a slave using a containerizer with pid namespace isolation.
   slave::Flags flags = this->CreateSlaveFlags();
-  flags.isolation = "cgroups/cpu,cgroups/mem,namespaces/pid";
+  flags.isolation = "cgroups/cpu,cgroups/mem,filesystem/linux,namespaces/pid";
   flags.agent_subsystems = "";
 
   Fetcher fetcher;

Reply via email to