Repository: mesos Updated Branches: refs/heads/master 7e0074852 -> fd9b28331
Added the linux filesystem isolator. Review: https://reviews.apache.org/r/37236 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f55e36a5 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f55e36a5 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f55e36a5 Branch: refs/heads/master Commit: f55e36a5a7e0e00186bdcb3c1dc7601f9ba90bb0 Parents: 7e00748 Author: Jie Yu <[email protected]> Authored: Fri Aug 7 15:32:48 2015 -0700 Committer: Jie Yu <[email protected]> Committed: Wed Aug 12 11:49:27 2015 -0700 ---------------------------------------------------------------------- include/mesos/mesos.proto | 5 +- src/Makefile.am | 2 + .../isolators/filesystem/linux.cpp | 445 +++++++++++++++++++ .../isolators/filesystem/linux.hpp | 118 +++++ src/slave/containerizer/mesos/containerizer.cpp | 19 + 5 files changed, 588 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/f55e36a5/include/mesos/mesos.proto ---------------------------------------------------------------------- diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto index 8a423a5..a4a9c6f 100644 --- a/include/mesos/mesos.proto +++ b/include/mesos/mesos.proto @@ -1243,7 +1243,10 @@ message Volume { required Mode mode = 3; - // Absolute path pointing to a directory or file in the container. + // Path pointing to a directory or file in the container. If the + // path is a relative path, it is relative to the container work + // directory. If the path is an absolute path, that path must + // already exist. required string container_path = 1; // The following specifies the source of this volume. At most one of http://git-wip-us.apache.org/repos/asf/mesos/blob/f55e36a5/src/Makefile.am ---------------------------------------------------------------------- diff --git a/src/Makefile.am b/src/Makefile.am index 07502f0..a27cde2 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -595,6 +595,7 @@ if OS_LINUX libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/cgroups/mem.cpp libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/cgroups/perf_event.cpp libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/namespaces/pid.cpp + libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/filesystem/linux.cpp libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/filesystem/shared.cpp libmesos_no_3rdparty_la_SOURCES += slave/containerizer/linux_launcher.cpp else @@ -717,6 +718,7 @@ libmesos_no_3rdparty_la_SOURCES += \ slave/containerizer/isolators/cgroups/mem.hpp \ slave/containerizer/isolators/cgroups/perf_event.hpp \ slave/containerizer/isolators/namespaces/pid.hpp \ + slave/containerizer/isolators/filesystem/linux.hpp \ slave/containerizer/isolators/filesystem/posix.hpp \ slave/containerizer/isolators/filesystem/shared.hpp \ slave/containerizer/mesos/containerizer.hpp \ http://git-wip-us.apache.org/repos/asf/mesos/blob/f55e36a5/src/slave/containerizer/isolators/filesystem/linux.cpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/isolators/filesystem/linux.cpp b/src/slave/containerizer/isolators/filesystem/linux.cpp new file mode 100644 index 0000000..1d95373 --- /dev/null +++ b/src/slave/containerizer/isolators/filesystem/linux.cpp @@ -0,0 +1,445 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <list> +#include <sstream> +#include <string> + +#include <glog/logging.h> + +#include <process/collect.hpp> + +#include <stout/error.hpp> +#include <stout/foreach.hpp> +#include <stout/os.hpp> +#include <stout/path.hpp> +#include <stout/stringify.hpp> +#include <stout/strings.hpp> + +#include "linux/fs.hpp" +#include "linux/ns.hpp" + +#include "slave/containerizer/isolators/filesystem/linux.hpp" + +using namespace process; + +using std::list; +using std::ostringstream; +using std::string; + +using mesos::slave::ContainerState; +using mesos::slave::ContainerLimitation; +using mesos::slave::ContainerPrepareInfo; +using mesos::slave::Isolator; + +namespace mesos { +namespace internal { +namespace slave { + +Try<Isolator*> LinuxFilesystemIsolatorProcess::create( + const Flags& flags, + const hashmap<Image::Type, Owned<Provisioner>>& provisioners) +{ + Result<string> user = os::user(); + if (!user.isSome()) { + return Error("Failed to determine user: " + + (user.isError() ? user.error() : "username not found")); + } + + if (user.get() != "root") { + return Error("LinuxFilesystemIsolator requires root privileges"); + } + + Owned<MesosIsolatorProcess> process( + new LinuxFilesystemIsolatorProcess(flags, provisioners)); + + return new MesosIsolator(process); +} + + +LinuxFilesystemIsolatorProcess::LinuxFilesystemIsolatorProcess( + const Flags& _flags, + const hashmap<Image::Type, Owned<Provisioner>>& _provisioners) + : flags(_flags), + provisioners(_provisioners) {} + + +LinuxFilesystemIsolatorProcess::~LinuxFilesystemIsolatorProcess() {} + + +Future<Option<int>> LinuxFilesystemIsolatorProcess::namespaces() +{ + return CLONE_NEWNS; +} + + +Future<Nothing> LinuxFilesystemIsolatorProcess::recover( + const list<ContainerState>& states, + const hashset<ContainerID>& orphans) +{ + list<Future<Nothing>> futures; + foreachvalue (const Owned<Provisioner>& provisioner, provisioners) { + futures.push_back(provisioner->recover(states, orphans)); + } + + return collect(futures) + .then(defer(PID<LinuxFilesystemIsolatorProcess>(this), + &LinuxFilesystemIsolatorProcess::_recover, + states, + orphans)); +} + + +Future<Nothing> LinuxFilesystemIsolatorProcess::_recover( + const list<ContainerState>& states, + const hashset<ContainerID>& orphans) +{ + foreach (const ContainerState& state, states) { + infos.put(state.container_id(), Owned<Info>(new Info(state.directory()))); + } + + // TODO(jieyu): Clean up unknown containers' work directory mounts + // and the corresponding persistent volume mounts. This can be + // achieved by iterating the mount table and find those unknown + // mounts whose sources are under the slave 'work_dir'. + + return Nothing(); +} + + +Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::prepare( + const ContainerID& containerId, + const ExecutorInfo& executorInfo, + const string& directory, + const Option<string>& user) +{ + if (infos.contains(containerId)) { + return Failure("Container has already been prepared"); + } + + infos.put(containerId, Owned<Info>(new Info(directory))); + + // Provision the root filesystem if needed. + if (executorInfo.has_container()) { + CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS); + + if (executorInfo.container().mesos().has_image()) { + const Image& image = executorInfo.container().mesos().image(); + + if (!provisioners.contains(image.type())) { + return Failure( + "No suitable provisioner found for container image type '" + + stringify(image.type()) + "'"); + } + + return provisioners[image.type()]->provision(containerId, image) + .then(defer(PID<LinuxFilesystemIsolatorProcess>(this), + &LinuxFilesystemIsolatorProcess::_prepare, + containerId, + executorInfo, + directory, + user, + lambda::_1)); + } + } + + // TODO(jieyu): Provision images in volumes as well. + + return _prepare(containerId, executorInfo, directory, user, None()); +} + + +Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::_prepare( + const ContainerID& containerId, + const ExecutorInfo& executorInfo, + const string& directory, + const Option<string>& user, + const Option<string>& rootfs) +{ + CHECK(infos.contains(containerId)); + + ContainerPrepareInfo prepareInfo; + + // If the container changes its root filesystem, we need to mount + // the container's work directory into its root filesystem (creating + // it if needed) so that the executor and the task can access the + // work directory. + // + // NOTE: The mount of the work directory must be a shared mount in + // the host filesystem so that any mounts underneath it will + // propagate into the container's mount namespace. This is how we + // can update persistent volumes for the container. + if (rootfs.isSome()) { + // This is the mount point of the work directory in the root filesystem. + const string sandbox = path::join(rootfs.get(), flags.sandbox_directory); + + if (!os::exists(sandbox)) { + Try<Nothing> mkdir = os::mkdir(sandbox); + if (mkdir.isError()) { + return Failure( + "Failed to create sandbox mount point at '" + + sandbox + "': " + mkdir.error()); + } + } + + Try<Nothing> mount = fs::mount( + directory, + sandbox, + None(), + MS_BIND, + NULL); + + if (mount.isError()) { + return Failure( + "Failed to mount work directory '" + directory + + "' to '" + sandbox + "': " + mount.error()); + } + + mount = fs::mount( + None(), + sandbox, + None(), + MS_SHARED, + NULL); + + if (mount.isError()) { + return Failure( + "Failed to mark work directory '" + directory + + "' as a shared mount: " + mount.error()); + } + + prepareInfo.set_rootfs(rootfs.get()); + } + + // Prepare the commands that will be run in the container's mount + // namespace right after forking the executor process. We use these + // commands to mount those volumes specified in the container info + // so that they don't pollute the host mount namespace. + if (executorInfo.has_container() && + executorInfo.container().volumes_size() > 0) { + Try<string> _script = script(executorInfo, directory, rootfs); + if (_script.isError()) { + return Failure("Failed to generate isolation script: " + _script.error()); + } + + CommandInfo* command = prepareInfo.add_commands(); + command->set_value(_script.get()); + } + + return update(containerId, executorInfo.resources()) + .then([prepareInfo]() -> Future<Option<ContainerPrepareInfo>> { + return prepareInfo; + }); +} + + +Try<string> LinuxFilesystemIsolatorProcess::script( + const ExecutorInfo& executorInfo, + const string& directory, + const Option<string>& rootfs) +{ + CHECK(executorInfo.has_container()); + + ostringstream out; + out << "#!/bin/sh\n"; + out << "set -x -e\n"; + + foreach (const Volume& volume, executorInfo.container().volumes()) { + if (!volume.has_host_path()) { + return Error("A volume misses 'host_path'"); + } + + // If both 'host_path' and 'container_path' are relative paths, + // return an error because the user can just directly access the + // volume in the work directory. + if (!strings::startsWith(volume.host_path(), "/") && + !strings::startsWith(volume.container_path(), "/")) { + return Error( + "Both 'host_path' and 'container_path' of a volume are relative"); + } + + // Determine the source of the mount. + string source; + + if (strings::startsWith(volume.host_path(), "/")) { + source = volume.host_path(); + + // An absolute path must already exist. + if (!os::exists(source)) { + return Error("Absolute host path does not exist"); + } + } else { + // Path is interpreted as relative to the work directory. + source = path::join(directory, volume.host_path()); + + // TODO(jieyu): We need to check that source resolves under the + // work directory because a user can potentially use a container + // path like '../../abc'. + + if (!os::exists(source)) { + Try<Nothing> mkdir = os::mkdir(source); + if (mkdir.isError()) { + return Error( + "Failed to create the source of the mount at '" + + source + "': " + mkdir.error()); + } + + // TODO(idownes): Consider setting ownership and mode. + } + } + + // Determine the target of the mount. + string target; + + if (strings::startsWith(volume.container_path(), "/")) { + if (rootfs.isSome()) { + target = path::join(rootfs.get(), volume.container_path()); + } else { + target = volume.container_path(); + } + + // An absolute path must already exist. This is because we want + // to avoid creating mount points outside the work directory in + // the host filesystem or in the container filesystem root. + if (!os::exists(target)) { + return Error("Absolute container path does not exist"); + } + + // TODO(jieyu): We need to check that target resolves under + // 'rootfs' because a user can potentially use a container path + // like '/../../abc'. + } else { + if (rootfs.isSome()) { + target = path::join(rootfs.get(), + flags.sandbox_directory, + volume.container_path()); + } else { + target = path::join(directory, volume.container_path()); + } + + // TODO(jieyu): We need to check that target resolves under the + // sandbox because a user can potentially use a container path + // like '../../abc'. + + if (!os::exists(target)) { + Try<Nothing> mkdir = os::mkdir(target); + if (mkdir.isError()) { + return Error( + "Failed to create the target of the mount at '" + + target + "': " + mkdir.error()); + } + } + } + + // TODO(jieyu): Consider the mode in the volume. + out << "mount -n --bind '" << source << "' '" << target << "'\n"; + } + + return out.str(); +} + + +Future<Nothing> LinuxFilesystemIsolatorProcess::isolate( + const ContainerID& containerId, + pid_t pid) +{ + // No-op, isolation happens when unsharing the mount namespace. + return Nothing(); +} + + +Future<ContainerLimitation> LinuxFilesystemIsolatorProcess::watch( + const ContainerID& containerId) +{ + // No-op. + return Future<ContainerLimitation>(); +} + + +Future<Nothing> LinuxFilesystemIsolatorProcess::update( + const ContainerID& containerId, + const Resources& resources) +{ + // TODO(jieyu): Update persistent volumes in this function. + return Nothing(); +} + + +Future<ResourceStatistics> LinuxFilesystemIsolatorProcess::usage( + const ContainerID& containerId) +{ + // No-op, no usage gathered. + return ResourceStatistics(); +} + + +Future<Nothing> LinuxFilesystemIsolatorProcess::cleanup( + const ContainerID& containerId) +{ + if (!infos.contains(containerId)) { + VLOG(1) << "Ignoring cleanup request for unknown container: " + << containerId; + + return Nothing(); + } + + const Owned<Info>& info = infos[containerId]; + + // Cleanup the mounts for this container in the host mount + // namespace, including container's work directory (if container + // root filesystem is used), and all the persistent volume mounts. + // + // NOTE: We don't need to cleanup mounts in the container's mount + // namespace because it's done automatically by the kernel when the + // mount namespace is destroyed after the last process terminates. + Try<fs::MountInfoTable> table = fs::MountInfoTable::read(); + if (table.isError()) { + return Failure("Failed to get mount table: " + table.error()); + } + + foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) { + // NOTE: Currently, all persistent volumes are mounted at targets + // under the container's work directory. + if (entry.root == info->directory || + strings::startsWith(entry.target, info->directory)) { + LOG(INFO) << "Unmounting '" << entry.target + << "' for container " << containerId; + + Try<Nothing> unmount = fs::unmount(entry.target, MNT_DETACH); + if (unmount.isError()) { + return Failure( + "Failed to unmount '" + entry.target + "': " + unmount.error()); + } + } + } + + infos.erase(containerId); + + // Destroy the provisioned root filesystem. + list<Future<bool>> futures; + foreachvalue (const Owned<Provisioner>& provisioner, provisioners) { + futures.push_back(provisioner->destroy(containerId)); + } + + return collect(futures) + .then([]() -> Future<Nothing> { return Nothing(); }); +} + +} // namespace slave { +} // namespace internal { +} // namespace mesos { http://git-wip-us.apache.org/repos/asf/mesos/blob/f55e36a5/src/slave/containerizer/isolators/filesystem/linux.hpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/isolators/filesystem/linux.hpp b/src/slave/containerizer/isolators/filesystem/linux.hpp new file mode 100644 index 0000000..7fb1bdc --- /dev/null +++ b/src/slave/containerizer/isolators/filesystem/linux.hpp @@ -0,0 +1,118 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LINUX_FILESYSTEM_ISOLATOR_HPP__ +#define __LINUX_FILESYSTEM_ISOLATOR_HPP__ + +#include <mesos/mesos.hpp> +#include <mesos/resources.hpp> + +#include <process/owned.hpp> + +#include <stout/hashmap.hpp> + +#include "slave/flags.hpp" + +#include "slave/containerizer/isolator.hpp" +#include "slave/containerizer/provisioner.hpp" + +namespace mesos { +namespace internal { +namespace slave { + +// The filesystem isolator on Linux that is responsible for preparing +// the root filesystems and volumes (e.g., persistent volumes) for +// containers. It relies on Linux mount namespace to prevent mounts of +// a container from being propagated to the host mount table. +class LinuxFilesystemIsolatorProcess : public MesosIsolatorProcess +{ +public: + static Try<mesos::slave::Isolator*> create( + const Flags& flags, + const hashmap<Image::Type, process::Owned<Provisioner>>& provisioners); + + virtual ~LinuxFilesystemIsolatorProcess(); + + virtual process::Future<Option<int>> namespaces(); + + virtual process::Future<Nothing> recover( + const std::list<mesos::slave::ContainerState>& states, + const hashset<ContainerID>& orphans); + + virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare( + const ContainerID& containerId, + const ExecutorInfo& executorInfo, + const std::string& directory, + const Option<std::string>& user); + + virtual process::Future<Nothing> isolate( + const ContainerID& containerId, + pid_t pid); + + virtual process::Future<mesos::slave::ContainerLimitation> watch( + const ContainerID& containerId); + + virtual process::Future<Nothing> update( + const ContainerID& containerId, + const Resources& resources); + + virtual process::Future<ResourceStatistics> usage( + const ContainerID& containerId); + + virtual process::Future<Nothing> cleanup( + const ContainerID& containerId); + +private: + LinuxFilesystemIsolatorProcess( + const Flags& flags, + const hashmap<Image::Type, process::Owned<Provisioner>>& provisioners); + + process::Future<Nothing> _recover( + const std::list<mesos::slave::ContainerState>& states, + const hashset<ContainerID>& orphans); + + process::Future<Option<mesos::slave::ContainerPrepareInfo>> _prepare( + const ContainerID& containerId, + const ExecutorInfo& executorInfo, + const std::string& directory, + const Option<std::string>& user, + const Option<std::string>& rootfs); + + Try<std::string> script( + const ExecutorInfo& executorInfo, + const std::string& directory, + const Option<std::string>& rootfs); + + const Flags flags; + + struct Info + { + Info(const std::string& _directory) : directory(_directory) {} + + const std::string directory; + }; + + hashmap<ContainerID, process::Owned<Info>> infos; + hashmap<Image::Type, process::Owned<Provisioner>> provisioners; +}; + +} // namespace slave { +} // namespace internal { +} // namespace mesos { + +#endif // __LINUX_FILESYSTEM_ISOLATOR_HPP__ http://git-wip-us.apache.org/repos/asf/mesos/blob/f55e36a5/src/slave/containerizer/mesos/containerizer.cpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp index 2cbb879..377de50 100644 --- a/src/slave/containerizer/mesos/containerizer.cpp +++ b/src/slave/containerizer/mesos/containerizer.cpp @@ -46,6 +46,7 @@ #ifdef __linux__ #include "slave/containerizer/linux_launcher.hpp" #endif +#include "slave/containerizer/provisioner.hpp" #include "slave/containerizer/isolators/posix.hpp" @@ -57,6 +58,9 @@ #include "slave/containerizer/isolators/cgroups/perf_event.hpp" #endif +#ifdef __linux__ +#include "slave/containerizer/isolators/filesystem/linux.hpp" +#endif #include "slave/containerizer/isolators/filesystem/posix.hpp" #ifdef __linux__ #include "slave/containerizer/isolators/filesystem/shared.hpp" @@ -138,12 +142,27 @@ Try<MesosContainerizer*> MesosContainerizer::create( LOG(INFO) << "Using isolation: " << isolation; +#ifdef __linux__ + // The provisioner will be used by the 'filesystem/linux' isolator. + Try<hashmap<Image::Type, Owned<Provisioner>>> provisioners = + Provisioner::create(flags, fetcher); + + if (provisioners.isError()) { + return Error("Failed to create provisioner(s): " + provisioners.error()); + } +#endif + // Create a MesosContainerizerProcess using isolators and a launcher. const hashmap<string, lambda::function<Try<Isolator*>(const Flags&)>> creators = { // Filesystem isolators. {"filesystem/posix", &PosixFilesystemIsolatorProcess::create}, #ifdef __linux__ + {"filesystem/linux", lambda::bind(&LinuxFilesystemIsolatorProcess::create, + lambda::_1, + provisioners.get())}, + + // TODO(jieyu): Deprecate this in favor of using filesystem/linux. {"filesystem/shared", &SharedFilesystemIsolatorProcess::create}, #endif
