Repository: mesos Updated Branches: refs/heads/master c22527cce -> efb73d343
Add filesystem/posix isolator for persistent volumes. Review: https://reviews.apache.org/r/34135 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/efb73d34 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/efb73d34 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/efb73d34 Branch: refs/heads/master Commit: efb73d3433a4b51b7f5e72dc4c6c10a5d61d39c0 Parents: c22527c Author: Ian Downes <[email protected]> Authored: Tue Jul 21 15:54:15 2015 -0700 Committer: Jie Yu <[email protected]> Committed: Tue Jul 21 17:09:50 2015 -0700 ---------------------------------------------------------------------- src/Makefile.am | 2 + .../isolators/filesystem/posix.cpp | 260 +++++++++++++++++++ .../isolators/filesystem/posix.hpp | 88 +++++++ src/slave/containerizer/mesos/containerizer.cpp | 171 ++---------- src/slave/containerizer/mesos/containerizer.hpp | 6 - 5 files changed, 371 insertions(+), 156 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/efb73d34/src/Makefile.am ---------------------------------------------------------------------- diff --git a/src/Makefile.am b/src/Makefile.am index d99f940..489ddb4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -403,6 +403,7 @@ libmesos_no_3rdparty_la_SOURCES = \ slave/containerizer/external_containerizer.cpp \ slave/containerizer/fetcher.cpp \ slave/containerizer/isolator.cpp \ + slave/containerizer/isolators/filesystem/posix.cpp \ slave/containerizer/isolators/posix/disk.cpp \ slave/containerizer/launcher.cpp \ slave/containerizer/mesos/containerizer.cpp \ @@ -640,6 +641,7 @@ libmesos_no_3rdparty_la_SOURCES += \ slave/containerizer/isolators/cgroups/mem.hpp \ slave/containerizer/isolators/cgroups/perf_event.hpp \ slave/containerizer/isolators/namespaces/pid.hpp \ + slave/containerizer/isolators/filesystem/posix.hpp \ slave/containerizer/isolators/filesystem/shared.hpp \ slave/containerizer/provisioner.hpp \ slave/resource_estimators/noop.hpp \ http://git-wip-us.apache.org/repos/asf/mesos/blob/efb73d34/src/slave/containerizer/isolators/filesystem/posix.cpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/isolators/filesystem/posix.cpp b/src/slave/containerizer/isolators/filesystem/posix.cpp new file mode 100644 index 0000000..1904279 --- /dev/null +++ b/src/slave/containerizer/isolators/filesystem/posix.cpp @@ -0,0 +1,260 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <list> +#include <string> + +#include <stout/fs.hpp> +#include <stout/os.hpp> +#include <stout/path.hpp> + +#include <mesos/mesos.hpp> +#include <mesos/resources.hpp> + +#include "slave/paths.hpp" + +#include "slave/containerizer/isolators/filesystem/posix.hpp" + +using namespace process; + +using std::list; +using std::string; + +using mesos::slave::ExecutorRunState; +using mesos::slave::Isolator; +using mesos::slave::IsolatorProcess; +using mesos::slave::Limitation; + +namespace mesos { +namespace internal { +namespace slave { + +PosixFilesystemIsolatorProcess::PosixFilesystemIsolatorProcess( + const Flags& _flags) + : flags(_flags) {} + + +PosixFilesystemIsolatorProcess::~PosixFilesystemIsolatorProcess() {} + + +Try<Isolator*> PosixFilesystemIsolatorProcess::create(const Flags& flags) +{ + process::Owned<IsolatorProcess> process( + new PosixFilesystemIsolatorProcess(flags)); + + return new Isolator(process); +} + + +Future<Nothing> PosixFilesystemIsolatorProcess::recover( + const list<ExecutorRunState>& states, + const hashset<ContainerID>& orphans) +{ + foreach (const ExecutorRunState& state, states) { + infos.put(state.id, Owned<Info>(new Info(state.directory))); + } + + return Nothing(); +} + + +Future<Option<CommandInfo>> PosixFilesystemIsolatorProcess::prepare( + const ContainerID& containerId, + const ExecutorInfo& executorInfo, + const string& directory, + const Option<string>& rootfs, + const Option<string>& user) +{ + if (infos.contains(containerId)) { + return Failure("Container has already been prepared"); + } + + // Return failure if the container change the filesystem root + // because the symlinks will become invalid in the new root. + if (rootfs.isSome()) { + return Failure("Container root filesystems not supported"); + } + + infos.put(containerId, Owned<Info>(new Info(directory))); + + return update(containerId, executorInfo.resources()) + .then([]() -> Future<Option<CommandInfo>> { return None(); }); +} + + +Future<Nothing> PosixFilesystemIsolatorProcess::isolate( + const ContainerID& containerId, + pid_t pid) +{ + // No-op. + return Nothing(); +} + + +Future<Limitation> PosixFilesystemIsolatorProcess::watch( + const ContainerID& containerId) +{ + // No-op. + return Future<Limitation>(); +} + + +Future<Nothing> PosixFilesystemIsolatorProcess::update( + const ContainerID& containerId, + const Resources& resources) +{ + if (!infos.contains(containerId)) { + return Failure("Unknown container"); + } + + const Owned<Info>& info = infos[containerId]; + + // TODO(jieyu): Currently, we only allow non-nested relative + // container paths for volumes. This is enforced by the master. For + // those volumes, we create symlinks in the executor directory. + Resources current = info->resources; + + // We first remove unneeded persistent volumes. + foreach (const Resource& resource, current.persistentVolumes()) { + // This is enforced by the master. + CHECK(resource.disk().has_volume()); + + // Ignore absolute and nested paths. + const string& containerPath = resource.disk().volume().container_path(); + if (strings::contains(containerPath, "/")) { + LOG(WARNING) << "Skipping updating symlink for persistent volume " + << resource << " of container " << containerId + << " because the container path '" << containerPath + << "' contains slash"; + continue; + } + + if (resources.contains(resource)) { + continue; + } + + string link = path::join(info->directory, containerPath); + + LOG(INFO) << "Removing symlink '" << link << "' for persistent volume " + << resource << " of container " << containerId; + + Try<Nothing> rm = os::rm(link); + if (rm.isError()) { + return Failure( + "Failed to remove the symlink for the unneeded " + "persistent volume at '" + link + "'"); + } + } + + // We then link additional persistent volumes. + foreach (const Resource& resource, resources.persistentVolumes()) { + // This is enforced by the master. + CHECK(resource.disk().has_volume()); + + // Ignore absolute and nested paths. + const string& containerPath = resource.disk().volume().container_path(); + if (strings::contains(containerPath, "/")) { + LOG(WARNING) << "Skipping updating symlink for persistent volume " + << resource << " of container " << containerId + << " because the container path '" << containerPath + << "' contains slash"; + continue; + } + + if (current.contains(resource)) { + continue; + } + + string original = paths::getPersistentVolumePath( + flags.work_dir, + resource.role(), + resource.disk().persistence().id()); + + // Set the ownership of the persistent volume to match that of the + // sandbox directory. + // + // NOTE: Currently, persistent volumes in Mesos are exclusive, + // meaning that if a persistent volume is used by one task or + // executor, it cannot be concurrently used by other task or + // executor. But if we allow multiple executors to use same + // persistent volume at the same time in the future, the ownership + // of the persistent volume may conflict here. + // + // TODO(haosdent): Consider letting the frameworks specify the + // user/group of the persistent volumes. + struct stat s; + if (::stat(info->directory.c_str(), &s) < 0) { + return Failure( + "Failed to get ownership for '" + info->directory + + "': " + strerror(errno)); + } + + LOG(INFO) << "Changing the ownership of the persistent volume at '" + << original << "' with uid " << s.st_uid + << " and gid " << s.st_gid; + + Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, original, true); + if (chown.isError()) { + return Failure( + "Failed to change the ownership of the persistent volume at '" + + original + "' with uid " + stringify(s.st_uid) + + " and gid " + stringify(s.st_gid) + ": " + chown.error()); + } + + string link = path::join(info->directory, containerPath); + + LOG(INFO) << "Adding symlink from '" << original << "' to '" + << link << "' for persistent volume " << resource + << " of container " << containerId; + + Try<Nothing> symlink = ::fs::symlink(original, link); + if (symlink.isError()) { + return Failure( + "Failed to symlink persistent volume from '" + + original + "' to '" + link + "'"); + } + } + + // Store the updated resources. + info->resources = resources; + + return Nothing(); +} + + +Future<ResourceStatistics> PosixFilesystemIsolatorProcess::usage( + const ContainerID& containerId) +{ + // No-op, no usage gathered. + return ResourceStatistics(); +} + + +Future<Nothing> PosixFilesystemIsolatorProcess::cleanup( + const ContainerID& containerId) +{ + // Symlinks for persistent resources will be removed when the work + // directory is GC'ed, therefore no need to do explicit cleanup. + infos.erase(containerId); + + return Nothing(); +} + +} // namespace slave { +} // namespace internal { +} // namespace mesos { http://git-wip-us.apache.org/repos/asf/mesos/blob/efb73d34/src/slave/containerizer/isolators/filesystem/posix.hpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/isolators/filesystem/posix.hpp b/src/slave/containerizer/isolators/filesystem/posix.hpp new file mode 100644 index 0000000..16ba26f --- /dev/null +++ b/src/slave/containerizer/isolators/filesystem/posix.hpp @@ -0,0 +1,88 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __POSIX_FILESYSTEM_ISOLATOR_HPP__ +#define __POSIX_FILESYSTEM_ISOLATOR_HPP__ + +#include <mesos/slave/isolator.hpp> + +#include "slave/flags.hpp" + +namespace mesos { +namespace internal { +namespace slave { + +class PosixFilesystemIsolatorProcess : public mesos::slave::IsolatorProcess +{ +public: + static Try<mesos::slave::Isolator*> create(const Flags& flags); + + virtual ~PosixFilesystemIsolatorProcess(); + + virtual process::Future<Nothing> recover( + const std::list<mesos::slave::ExecutorRunState>& states, + const hashset<ContainerID>& orphans); + + virtual process::Future<Option<CommandInfo>> prepare( + const ContainerID& containerId, + const ExecutorInfo& executorInfo, + const std::string& directory, + const Option<std::string>& rootfs, + const Option<std::string>& user); + + virtual process::Future<Nothing> isolate( + const ContainerID& containerId, + pid_t pid); + + virtual process::Future<mesos::slave::Limitation> watch( + const ContainerID& containerId); + + virtual process::Future<Nothing> update( + const ContainerID& containerId, + const Resources& resources); + + virtual process::Future<ResourceStatistics> usage( + const ContainerID& containerId); + + virtual process::Future<Nothing> cleanup( + const ContainerID& containerId); + +private: + PosixFilesystemIsolatorProcess(const Flags& flags); + + const Flags flags; + + struct Info + { + explicit Info(const std::string& _directory) + : directory(_directory) {} + + const std::string directory; + + // Track resources so we can unlink unneeded persistent volumes. + Resources resources; + }; + + hashmap<ContainerID, process::Owned<Info>> infos; +}; + +} // namespace slave { +} // namespace internal { +} // namespace mesos { + +#endif // __POSIX_FILESYSTEM_ISOLATOR_HPP__ http://git-wip-us.apache.org/repos/asf/mesos/blob/efb73d34/src/slave/containerizer/mesos/containerizer.cpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp index d3a596d..609620c 100644 --- a/src/slave/containerizer/mesos/containerizer.cpp +++ b/src/slave/containerizer/mesos/containerizer.cpp @@ -44,14 +44,24 @@ #endif // __linux__ #include "slave/containerizer/isolators/posix.hpp" + #include "slave/containerizer/isolators/posix/disk.hpp" + #ifdef __linux__ #include "slave/containerizer/isolators/cgroups/cpushare.hpp" #include "slave/containerizer/isolators/cgroups/mem.hpp" #include "slave/containerizer/isolators/cgroups/perf_event.hpp" +#endif + +#include "slave/containerizer/isolators/filesystem/posix.hpp" +#ifdef __linux__ #include "slave/containerizer/isolators/filesystem/shared.hpp" +#endif + +#ifdef __linux__ #include "slave/containerizer/isolators/namespaces/pid.hpp" -#endif // __linux__ +#endif + #ifdef WITH_NETWORK_ISOLATOR #include "slave/containerizer/isolators/network/port_mapping.hpp" #endif @@ -104,6 +114,12 @@ Try<MesosContainerizer*> MesosContainerizer::create( isolation = flags.isolation; } + // A filesystem isolator is required for persistent volumes; use the + // filesystem/posix isolator if necessary. + if (!strings::contains(isolation, "filesystem/")) { + isolation += ",filesystem/posix"; + } + // Modify the flags to include any changes to isolation. Flags flags_ = flags; flags_.isolation = isolation; @@ -112,6 +128,7 @@ Try<MesosContainerizer*> MesosContainerizer::create( // Create a MesosContainerizerProcess using isolators and a launcher. static const hashmap<string, Try<Isolator*> (*)(const Flags&)> creators = { + {"filesystem/posix", &PosixFilesystemIsolatorProcess::create}, {"posix/cpu", &PosixCpuIsolatorProcess::create}, {"posix/mem", &PosixMemIsolatorProcess::create}, {"posix/disk", &PosixDiskIsolatorProcess::create}, @@ -565,24 +582,10 @@ Future<bool> MesosContainerizerProcess::launch( container->executorInfo = executorInfo; container->directory = directory; container->state = PREPARING; - containers_[containerId] = Owned<Container>(container); - - // Prepare volumes for the container. - // TODO(jieyu): Consider decoupling file system isolation from - // runtime isolation. The existing isolators are actually for - // runtime isolation. For file system isolation, the interface might - // be different and we always need a file system isolator. The - // following logic should be moved to the file system isolator. - Try<Nothing> update = updateVolumes(containerId, executorInfo.resources()); - if (update.isError()) { - return Failure("Failed to prepare volumes: " + update.error()); - } - - // NOTE: We do not update 'container->resources' until volumes are - // prepared because 'updateVolumes' above depends on the current - // container resources. container->resources = executorInfo.resources(); + containers_.put(containerId, Owned<Container>(container)); + return provision(containerId, executorInfo, slaveId, directory, checkpoint) .then(defer(self(), &Self::prepare, @@ -1011,13 +1014,6 @@ Future<Nothing> MesosContainerizerProcess::update( return Nothing(); } - // Update volumes for the container. - // TODO(jieyu): See comments above 'updateVolumes' in 'launch'. - Try<Nothing> update = updateVolumes(containerId, resources); - if (update.isError()) { - return Failure("Failed to update volumes: " + update.error()); - } - // NOTE: We update container's resources before isolators are updated // so that subsequent containerizer->update can be handled properly. container->resources = resources; @@ -1412,131 +1408,6 @@ MesosContainerizerProcess::Metrics::~Metrics() } -Try<Nothing> MesosContainerizerProcess::updateVolumes( - const ContainerID& containerId, - const Resources& updated) -{ - CHECK(containers_.contains(containerId)); - const Owned<Container>& container = containers_[containerId]; - - // TODO(jieyu): Currently, we only allow non-nested relative - // container paths for volumes. This is enforced by the master. For - // those volumes, we create symlinks in the executor directory. No - // need to proceed if the container change the file system root - // because the symlinks will become invalid if the file system root - // is changed. Consider moving this logic to the file system - // isolator and let the file system isolator decide how to mount - // those volumes (by either creating symlinks or doing bind mounts). - if (container->rootfs.isSome()) { - LOG(WARNING) << "Cannot update volumes for container " << containerId - << " because it changes the file system root"; - return Nothing(); - } - - Resources current = container->resources; - - // We first remove unneeded persistent volumes. - foreach (const Resource& resource, current.persistentVolumes()) { - // This is enforced by the master. - CHECK(resource.disk().has_volume()); - - // Ignore absolute and nested paths. - const string& containerPath = resource.disk().volume().container_path(); - if (strings::contains(containerPath, "/")) { - LOG(WARNING) << "Skipping updating symlink for persistent volume " - << resource << " of container " << containerId - << " because the container path '" << containerPath - << "' contains slash"; - continue; - } - - if (updated.contains(resource)) { - continue; - } - - string link = path::join(container->directory, containerPath); - - LOG(INFO) << "Removing symlink '" << link << "' for persistent volume " - << resource << " of container " << containerId; - - Try<Nothing> rm = os::rm(link); - if (rm.isError()) { - return Error( - "Failed to remove the symlink for the unneeded " - "persistent volume at '" + link + "'"); - } - } - - // We then link additional persistent volumes. - foreach (const Resource& resource, updated.persistentVolumes()) { - // This is enforced by the master. - CHECK(resource.disk().has_volume()); - - // Ignore absolute and nested paths. - const string& containerPath = resource.disk().volume().container_path(); - if (strings::contains(containerPath, "/")) { - LOG(WARNING) << "Skipping updating symlink for persistent volume " - << resource << " of container " << containerId - << " because the container path '" << containerPath - << "' contains slash"; - continue; - } - - if (current.contains(resource)) { - continue; - } - - string link = path::join(container->directory, containerPath); - - string original = paths::getPersistentVolumePath( - flags.work_dir, - resource.role(), - resource.disk().persistence().id()); - - LOG(INFO) << "Adding symlink from '" << original << "' to '" - << link << "' for persistent volume " << resource - << " of container " << containerId; - - Try<Nothing> symlink = fs::symlink(original, link); - if (symlink.isError()) { - return Error( - "Failed to symlink persistent volume from '" + - original + "' to '" + link + "'"); - } - - // Set the ownership of persistent volume to match the sandbox - // directory. Currently, persistent volumes in mesos are - // exclusive. If one persistent volume is used by one - // task/executor, it cannot be concurrently used by other - // task/executor. But if we allow multiple executors use same - // persistent volume at the same time in the future, the ownership - // of persistent volume may conflict here. - // TODO(haosdent): We need to update this after we have a proposed - // plan to adding user/group to persistent volumes. - struct stat s; - if (::stat(container->directory.c_str(), &s) < 0) { - return Error("Failed to get permissions on '" + container->directory + - "': " + strerror(errno)); - } - - Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, original, true); - if (chown.isError()) { - return Error("Failed to chown persistent volume '" + original + - "': " + chown.error()); - } - } - - return Nothing(); -} - - -static list<Future<Nothing>> __cleanupIsolators( - const list<Future<Nothing>>& cleanups) -{ - return cleanups; -} - - static Future<list<Future<Nothing>>> _cleanupIsolators( const Owned<Isolator>& isolator, const ContainerID& containerId, @@ -1553,7 +1424,7 @@ static Future<list<Future<Nothing>>> _cleanupIsolators( cleanup_.push_back(cleanup); return await(cleanup_) - .then(lambda::bind(&__cleanupIsolators, cleanups)); + .then([cleanups]() -> list<Future<Nothing>> { return cleanups; }); } http://git-wip-us.apache.org/repos/asf/mesos/blob/efb73d34/src/slave/containerizer/mesos/containerizer.hpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/mesos/containerizer.hpp b/src/slave/containerizer/mesos/containerizer.hpp index a8c71d6..f6c580d 100644 --- a/src/slave/containerizer/mesos/containerizer.hpp +++ b/src/slave/containerizer/mesos/containerizer.hpp @@ -268,12 +268,6 @@ private: // destroy. void reaped(const ContainerID& containerId); - // Updates volumes for the given container according to its current - // resources and the given updated resources. - Try<Nothing> updateVolumes( - const ContainerID& containerId, - const Resources& updated); - // TODO(jieyu): Consider introducing an Isolators struct and moving // all isolator related operations to that struct. process::Future<std::list<process::Future<Nothing>>> cleanupIsolators(
