Added persistent volume support for linux filesystem isolator. Review: https://reviews.apache.org/r/37330
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/0b3cdecb Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/0b3cdecb Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/0b3cdecb Branch: refs/heads/master Commit: 0b3cdecb41747563827128717e22351265786bc7 Parents: 03b28b1 Author: Jie Yu <[email protected]> Authored: Mon Aug 10 17:23:45 2015 -0700 Committer: Jie Yu <[email protected]> Committed: Wed Aug 12 16:53:41 2015 -0700 ---------------------------------------------------------------------- .../isolators/filesystem/linux.cpp | 237 ++++++++++++++++++- .../isolators/filesystem/linux.hpp | 10 + 2 files changed, 235 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/0b3cdecb/src/slave/containerizer/isolators/filesystem/linux.cpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/isolators/filesystem/linux.cpp b/src/slave/containerizer/isolators/filesystem/linux.cpp index 1d95373..f36424e 100644 --- a/src/slave/containerizer/isolators/filesystem/linux.cpp +++ b/src/slave/containerizer/isolators/filesystem/linux.cpp @@ -34,6 +34,8 @@ #include "linux/fs.hpp" #include "linux/ns.hpp" +#include "slave/paths.hpp" + #include "slave/containerizer/isolators/filesystem/linux.hpp" using namespace process; @@ -109,8 +111,26 @@ Future<Nothing> LinuxFilesystemIsolatorProcess::_recover( const list<ContainerState>& states, const hashset<ContainerID>& orphans) { + // Read the mount table in the host mount namespace to recover paths + // to containers' work directories if their root filesystems are + // changed. Method 'cleanup()' relies on this information to clean + // up mounts in the host mount namespace for each container. 
+ Try<fs::MountInfoTable> table = fs::MountInfoTable::read(); + if (table.isError()) { + return Failure("Failed to get mount table: " + table.error()); + } + foreach (const ContainerState& state, states) { - infos.put(state.container_id(), Owned<Info>(new Info(state.directory()))); + Owned<Info> info(new Info(state.directory())); + + foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) { + if (entry.root == info->directory) { + info->sandbox = entry.target; + break; + } + } + + infos.put(state.container_id(), info); } // TODO(jieyu): Clean up unknown containers' work directory mounts @@ -173,6 +193,8 @@ Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::_prepare( { CHECK(infos.contains(containerId)); + const Owned<Info>& info = infos[containerId]; + ContainerPrepareInfo prepareInfo; // If the container changes its root filesystem, we need to mount @@ -188,6 +210,9 @@ Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::_prepare( // This is the mount point of the work directory in the root filesystem. const string sandbox = path::join(rootfs.get(), flags.sandbox_directory); + // Save the path 'sandbox' which will be used in 'cleanup()'. + info->sandbox = sandbox; + if (!os::exists(sandbox)) { Try<Nothing> mkdir = os::mkdir(sandbox); if (mkdir.isError()) { @@ -259,6 +284,10 @@ Try<string> LinuxFilesystemIsolatorProcess::script( out << "#!/bin/sh\n"; out << "set -x -e\n"; + // Make sure mounts in the container mount namespace do not + // propagate back to the host mount namespace. + out << "mount --make-rslave /\n"; + foreach (const Volume& volume, executorInfo.container().volumes()) { if (!volume.has_host_path()) { return Error("A volume misses 'host_path'"); @@ -375,7 +404,166 @@ Future<Nothing> LinuxFilesystemIsolatorProcess::update( const ContainerID& containerId, const Resources& resources) { - // TODO(jieyu): Update persistent volumes in this function. + // Mount persistent volumes. 
We do this in the host namespace and + // rely on mount propagation for them to be visible inside the + // container. + if (!infos.contains(containerId)) { + return Failure("Unknown container"); + } + + const Owned<Info>& info = infos[containerId]; + + Resources current = info->resources; + + // We first remove unneeded persistent volumes. + foreach (const Resource& resource, current.persistentVolumes()) { + // This is enforced by the master. + CHECK(resource.disk().has_volume()); + + // Ignore absolute and nested paths. + const string& containerPath = resource.disk().volume().container_path(); + if (strings::contains(containerPath, "/")) { + LOG(WARNING) << "Skipping updating mount for persistent volume " + << resource << " of container " << containerId + << " because the container path '" << containerPath + << "' contains slash"; + continue; + } + + if (resources.contains(resource)) { + continue; + } + + // Determine the target of the mount. + string target; + + if (info->sandbox.isSome()) { + target = path::join(info->sandbox.get(), containerPath); + } else { + target = path::join(info->directory, containerPath); + } + + LOG(INFO) << "Removing mount '" << target << "' for persistent volume " + << resource << " of container " << containerId; + + // The unmount will fail if the task/executor is still using files + // or directories under 'target'. + Try<Nothing> unmount = fs::unmount(target); + if (unmount.isError()) { + return Failure( + "Failed to unmount unneeded persistent volume at '" + + target + "': " + unmount.error()); + } + + // NOTE: This is a non-recursive rmdir. + Try<Nothing> rmdir = os::rmdir(target, false); + if (rmdir.isError()) { + return Failure( + "Failed to remove persistent volume mount point at '" + + target + "': " + rmdir.error()); + } + } + + // We then mount new persistent volumes. + foreach (const Resource& resource, resources.persistentVolumes()) { + // This is enforced by the master. 
+ CHECK(resource.disk().has_volume()); + + // Ignore absolute and nested paths. + const string& containerPath = resource.disk().volume().container_path(); + if (strings::contains(containerPath, "/")) { + LOG(WARNING) << "Skipping updating mount for persistent volume " + << resource << " of container " << containerId + << " because the container path '" << containerPath + << "' contains slash"; + continue; + } + + if (current.contains(resource)) { + continue; + } + + // Determine the source of the mount. + string source = paths::getPersistentVolumePath( + flags.work_dir, + resource.role(), + resource.disk().persistence().id()); + + // Set the ownership of the persistent volume to match that of the + // sandbox directory. + // + // NOTE: Currently, persistent volumes in Mesos are exclusive, + // meaning that if a persistent volume is used by one task or + // executor, it cannot be concurrently used by another task or + // executor. But if we allow multiple executors to use the same + // persistent volume at the same time in the future, the ownership + // of the persistent volume may conflict here. + // + // TODO(haosdent): Consider letting the frameworks specify the + // user/group of the persistent volumes. + struct stat s; + if (::stat(info->directory.c_str(), &s) < 0) { + return Failure( + "Failed to get ownership for '" + info->directory + + "': " + strerror(errno)); + } + + LOG(INFO) << "Changing the ownership of the persistent volume at '" + << source << "' with uid " << s.st_uid + << " and gid " << s.st_gid; + + Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, source, true); + if (chown.isError()) { + return Failure( + "Failed to change the ownership of the persistent volume at '" + + source + "' with uid " + stringify(s.st_uid) + + " and gid " + stringify(s.st_gid) + ": " + chown.error()); + } + + // Determine the target of the mount. 
+ string target; + + if (info->sandbox.isSome()) { + target = path::join(info->sandbox.get(), containerPath); + } else { + target = path::join(info->directory, containerPath); + } + + if (os::exists(target)) { + // NOTE: This is possible because 'info->resources' will be + // reset when slave restarts and recovers. When the slave calls + // 'containerizer->update' after the executor re-registers, + // we'll try to re-mount all the already mounted volumes. + + // TODO(jieyu): Check the source of the mount matches the entry + // with the same target in the mount table if one can be found. + // If not, mount the persistent volume as we did below. This is + // possible because the slave could crash after it unmounts the + // volume but before it is able to delete the mount point. + } else { + Try<Nothing> mkdir = os::mkdir(target); + if (mkdir.isError()) { + return Failure( + "Failed to create persistent volume mount point at '" + + target + "': " + mkdir.error()); + } + + LOG(INFO) << "Mounting '" << source << "' to '" << target + << "' for persistent volume " << resource + << " of container " << containerId; + + Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL); + if (mount.isError()) { + return Failure( + "Failed to mount persistent volume from '" + + source + "' to '" + target + "': " + mount.error()); + } + } + } + + // Store the new resources. + info->resources = resources; + return Nothing(); } @@ -400,34 +588,59 @@ Future<Nothing> LinuxFilesystemIsolatorProcess::cleanup( const Owned<Info>& info = infos[containerId]; - // Cleanup the mounts for this container in the host mount - // namespace, including container's work directory (if container - // root filesystem is used), and all the persistent volume mounts. - // // NOTE: We don't need to cleanup mounts in the container's mount // namespace because it's done automatically by the kernel when the // mount namespace is destroyed after the last process terminates. 
+ + // The path to the container's work directory which is the parent of + // all the persistent volume mounts. + string sandbox; + + if (info->sandbox.isSome()) { + sandbox = info->sandbox.get(); + } else { + sandbox = info->directory; + } + + // Cleanup the mounts for this container in the host mount + // namespace, including container's work directory (if container + // root filesystem is used), and all the persistent volume mounts. Try<fs::MountInfoTable> table = fs::MountInfoTable::read(); if (table.isError()) { return Failure("Failed to get mount table: " + table.error()); } foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) { - // NOTE: Currently, all persistent volumes are mounted at targets - // under the container's work directory. - if (entry.root == info->directory || - strings::startsWith(entry.target, info->directory)) { - LOG(INFO) << "Unmounting '" << entry.target + // NOTE: All persistent volumes are mounted at targets under the + // container's work directory. + if (entry.target != sandbox && + strings::startsWith(entry.target, sandbox)) { + LOG(INFO) << "Unmounting volume '" << entry.target << "' for container " << containerId; Try<Nothing> unmount = fs::unmount(entry.target, MNT_DETACH); if (unmount.isError()) { return Failure( - "Failed to unmount '" + entry.target + "': " + unmount.error()); + "Failed to unmount volume '" + entry.target + + "': " + unmount.error()); } } } + // Cleanup the container's work directory mount. We only need to do + // that if the container specifies a filesystem root. + if (info->sandbox.isSome()) { + LOG(INFO) << "Unmounting sandbox '" << info->sandbox.get() + << "' for container " << containerId; + + Try<Nothing> unmount = fs::unmount(info->sandbox.get(), MNT_DETACH); + if (unmount.isError()) { + return Failure( + "Failed to unmount sandbox '" + info->sandbox.get() + + "': " + unmount.error()); + } + } + infos.erase(containerId); // Destroy the provisioned root filesystem. 
http://git-wip-us.apache.org/repos/asf/mesos/blob/0b3cdecb/src/slave/containerizer/isolators/filesystem/linux.hpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/isolators/filesystem/linux.hpp b/src/slave/containerizer/isolators/filesystem/linux.hpp index 7fb1bdc..ee5b33d 100644 --- a/src/slave/containerizer/isolators/filesystem/linux.hpp +++ b/src/slave/containerizer/isolators/filesystem/linux.hpp @@ -105,6 +105,16 @@ private: Info(const std::string& _directory) : directory(_directory) {} const std::string directory; + + // The absolute path to the container's work directory mount point + // in the new root filesystem if the container changes its root + // filesystem (i.e., '<rootfs>/<flags.sandbox_directory>'). If the + // container does not specify a root filesystem, this field will + // not be set. + Option<std::string> sandbox; + + // Track resources so we can unmount unneeded persistent volumes. + Resources resources; }; hashmap<ContainerID, process::Owned<Info>> infos;
