This is an automated email from the ASF dual-hosted git repository. gilbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 221efd311f241575ebf3170de663cd301d61e252 Author: Qian Zhang <[email protected]> AuthorDate: Wed Feb 27 22:22:04 2019 -0800 Added volume gid manager. This manager is used to allocate/deallocate gids for shared persistent volumes and PARENT type SANDBOX_PATH volumes. Review: https://reviews.apache.org/r/69675/ --- src/CMakeLists.txt | 4 +- src/Makefile.am | 10 +- src/slave/volume_gid_manager/state.hpp | 23 ++ src/slave/volume_gid_manager/state.proto | 41 +++ .../volume_gid_manager/volume_gid_manager.cpp | 371 +++++++++++++++++++++ .../volume_gid_manager/volume_gid_manager.hpp | 64 ++++ 6 files changed, 510 insertions(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 687dc85..3397c3b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -85,6 +85,7 @@ PROTOC_GENERATE(INTERNAL TARGET slave/containerizer/mesos/isolators/network/cni/ PROTOC_GENERATE(INTERNAL TARGET slave/containerizer/mesos/isolators/docker/volume/state) PROTOC_GENERATE(INTERNAL TARGET slave/containerizer/mesos/provisioner/docker/message) PROTOC_GENERATE(INTERNAL TARGET slave/state) +PROTOC_GENERATE(INTERNAL TARGET slave/volume_gid_manager/state) PROTOC_GENERATE(INTERNAL TARGET master/registry) PROTOC_GENERATE(INTERNAL TARGET resource_provider/registry) PROTOC_GENERATE(INTERNAL TARGET resource_provider/state) @@ -198,7 +199,8 @@ if (NOT WIN32) slave/containerizer/mesos/isolators/posix/disk.cpp slave/containerizer/mesos/isolators/posix/rlimits.cpp slave/containerizer/mesos/isolators/volume/sandbox_path.cpp - slave/containerizer/mesos/provisioner/utils.cpp) + slave/containerizer/mesos/provisioner/utils.cpp + slave/volume_gid_manager/volume_gid_manager.cpp) endif () set(APPC_SRC diff --git a/src/Makefile.am b/src/Makefile.am index 283d5ed..6bf2b97 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -414,7 +414,9 @@ CXX_PROTOS += \ slave/containerizer/mesos/isolators/docker/volume/state.pb.cc \ slave/containerizer/mesos/isolators/docker/volume/state.pb.h \ slave/containerizer/mesos/isolators/network/cni/spec.pb.cc \ - slave/containerizer/mesos/isolators/network/cni/spec.pb.h + slave/containerizer/mesos/isolators/network/cni/spec.pb.h \ + slave/volume_gid_manager/state.pb.cc \ + slave/volume_gid_manager/state.pb.h CXX_PROTOS += \ resource_provider/storage/disk_profile.pb.cc \ @@ -1002,7 +1004,8 @@ libmesos_no_3rdparty_la_SOURCES = \ slave/state.proto \ slave/containerizer/mesos/provisioner/docker/message.proto \ slave/containerizer/mesos/isolators/docker/volume/state.proto \ - slave/containerizer/mesos/isolators/network/cni/spec.proto + slave/containerizer/mesos/isolators/network/cni/spec.proto \ + slave/volume_gid_manager/state.proto # TODO(tillt): Remove authentication/cram_md5/* which will enable us to # lose the immediate cyrus-sasl2 dependency. @@ -1287,6 +1290,9 @@ libmesos_no_3rdparty_la_SOURCES += \ slave/task_status_update_manager.hpp \ slave/validation.cpp \ slave/validation.hpp \ + slave/volume_gid_manager/state.hpp \ + slave/volume_gid_manager/volume_gid_manager.cpp \ + slave/volume_gid_manager/volume_gid_manager.hpp \ slave/windows_ctrlhandler.hpp \ status_update_manager/operation.cpp \ status_update_manager/operation.hpp \ diff --git a/src/slave/volume_gid_manager/state.hpp b/src/slave/volume_gid_manager/state.hpp new file mode 100644 index 0000000..8b8d78c --- /dev/null +++ b/src/slave/volume_gid_manager/state.hpp @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef __VOLUME_GID_MANAGER_STATE_HPP__ +#define __VOLUME_GID_MANAGER_STATE_HPP__ + +// ONLY USEFUL AFTER RUNNING PROTOC. +#include "slave/volume_gid_manager/state.pb.h" + +#endif // __VOLUME_GID_MANAGER_STATE_HPP__ diff --git a/src/slave/volume_gid_manager/state.proto b/src/slave/volume_gid_manager/state.proto new file mode 100644 index 0000000..e6a31d3 --- /dev/null +++ b/src/slave/volume_gid_manager/state.proto @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mesos.internal.slave; + +message VolumeGidInfo { + enum Type { + UNKNOWN = 0; + PERSISTENT = 1; + SANDBOX_PATH = 2; + } + + // The type of the volume. + optional Type type = 1; + + // The source path of the volume. + required string path = 2; + + // The gid allocated to the volume + required uint32 gid = 3; +} + + +message VolumeGidInfos { + repeated VolumeGidInfo infos = 1; +} diff --git a/src/slave/volume_gid_manager/volume_gid_manager.cpp b/src/slave/volume_gid_manager/volume_gid_manager.cpp new file mode 100644 index 0000000..ed8f6a2 --- /dev/null +++ b/src/slave/volume_gid_manager/volume_gid_manager.cpp @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <fts.h> +#include <sys/types.h> + +#include <string> + +#include <mesos/resources.hpp> + +#include <process/dispatch.hpp> +#include <process/id.hpp> + +#include <stout/os/su.hpp> + +#include "common/values.hpp" + +#include "slave/volume_gid_manager/volume_gid_manager.hpp" + +using std::string; +using std::vector; + +using process::dispatch; +using process::Failure; +using process::Future; +using process::Owned; + +using mesos::internal::values::rangesToIntervalSet; + +namespace mesos { +namespace internal { +namespace slave { + +// Recursively change the owner group of the given path +// to the given gid and set/unset the `setgid` bit. +static Try<Nothing> setVolumeOwnership( + const string& path, + gid_t gid, + bool setgid) +{ + LOG(INFO) << "Start setting the owner group of the volume path '" + << path << "' " << (setgid ? "" : "back ") << "to " << gid; + + char* path_[] = {const_cast<char*>(path.c_str()), nullptr}; + + FTS* tree = ::fts_open(path_, FTS_NOCHDIR | FTS_PHYSICAL, nullptr); + if (tree == nullptr) { + return ErrnoError("Failed to open '" + path + "'"); + } + + FTSENT *node; + while ((node = ::fts_read(tree)) != nullptr) { + const Path path = Path(node->fts_path); + + switch (node->fts_info) { + // Preorder directory. + case FTS_D: + // Regular file. + case FTS_F: + // Symbolic link. + case FTS_SL: { + CHECK_NOTNULL(node->fts_statp); + + // Change the owner group to the given gid. + if (::lchown(node->fts_path, node->fts_statp->st_uid, gid) < 0) { + Error error = ErrnoError(); + ::fts_close(tree); + return Error( + "Chown failed on '" + path.string() + "': " + error.message); + } + + if (node->fts_info == FTS_D) { + // Set the `setgid` bit for directories and add the write + // permission for the owner group. + if (setgid) { + if (::chmod( + node->fts_path, + node->fts_statp->st_mode | S_ISGID | S_IWGRP)) { + Error error = ErrnoError(); + ::fts_close(tree); + return Error( + "Chmod failed on '" + path.string() + "': " + error.message); + } + } else { + // Unset the `setgid` bit for directories and remove the write + // permission for the owner group. + if (::chmod( + node->fts_path, + node->fts_statp->st_mode & ~S_ISGID & ~S_IWGRP)) { + Error error = ErrnoError(); + ::fts_close(tree); + return Error( + "Chmod failed on '" + path.string() + "': " + error.message); + } + } + } + + break; + } + + // Unreadable directory. + case FTS_DNR: + // Error; errno is set. + case FTS_ERR: + // `stat(2)` failed. + case FTS_NS: { + Error error = ErrnoError(node->fts_errno); + ::fts_close(tree); + return Error( + "Failed to read '" + path.string() + "': " + error.message); + } + + default: + break; + } + } + + if (errno != 0) { + Error error = ErrnoError(); + ::fts_close(tree); + return error; + } + + if (::fts_close(tree) != 0) { + return ErrnoError("Failed to stop traversing file system"); + } + + LOG(INFO) << "Finished setting the owner group of the volume path '" + << path << "' " << (setgid ? "" : "back ") << "to " << gid; + + return Nothing(); +} + + +class VolumeGidManagerProcess : public process::Process<VolumeGidManagerProcess> +{ +public: + VolumeGidManagerProcess(const IntervalSet<gid_t>& gids) + : ProcessBase(process::ID::generate("volume-gid-manager")), + totalGids(gids), + freeGids(gids) {} + + // This method will be called when a container running as non-root user tries + // to use a shared persistent volume or a PARENT type SANDBOX_PATH volume, the + // parameter `path` will be the source path of the volume. + Future<gid_t> allocate(const string& path, VolumeGidInfo::Type type) + { + gid_t gid; + + // If a gid has already been allocated for the specified path, + // just return the gid. + if (infos.contains(path)) { + gid = infos[path].gid(); + + LOG(INFO) << "Use the allocated gid " << gid << " of the volume path '" + << path << "'"; + } else { + // Allocate a free gid to the specified path and then set the + // ownership for it. + if (freeGids.empty()) { + return Failure( + "Failed to allocate gid to the volume path '" + path + + "' because the free gid range is exhausted"); + } + + gid = freeGids.begin()->lower(); + + LOG(INFO) << "Allocating gid " << gid << " to the volume path '" + << path << "'"; + + Try<Nothing> result = setVolumeOwnership(path, gid, true); + if (result.isError()) { + return Failure( + "Failed to set the owner group of the volume path '" + path + + "' to " + stringify(gid) + ": " + result.error()); + } + + freeGids -= gid; + + VolumeGidInfo info; + info.set_type(type); + info.set_path(path); + info.set_gid(gid); + + infos.put(path, info); + } + + return gid; + } + + // This method will be called in two cases: + // 1. When a shared persistent volume is destroyed by agent, the parameter + // `path` will be the shared persistent volume's path. + // 2. When a container is destroyed by containerizer, the parameter `path` + // will be the container's sandbox path. + // We search if the given path is contained in `infos` (for the case 1) or is + // the parent directory of any volume paths in `infos` (for the case 2, i.e., + // the PARENT type SANDBOX_PATH volume must be a subdirectory in the parent + // container's sandbox) and then free the allocated gid for the found path(s). + Future<Nothing> deallocate(const string& path) + { + vector<string> sandboxPathVolumes; + + for (auto it = infos.begin(); it != infos.end(); ) { + const VolumeGidInfo& info = it->second; + const string& volumePath = info.path(); + + if (strings::startsWith(volumePath, path)) { + if (volumePath != path) { + // This is the case of the PARENT type SANDBOX_PATH volume. + sandboxPathVolumes.push_back(volumePath); + } + + gid_t gid = info.gid(); + + LOG(INFO) << "Deallocated gid " << gid << " for the volume path '" + << volumePath << "'"; + + // Only return the gid to the free range if it is in the total + // range. The gid may not be in the total range in the case that + // Mesos agent is restarted with a different total range and we + // deallocate gid for a previous volume path from the old range. + if (totalGids.contains(gid)) { + freeGids += gid; + } + + it = infos.erase(it); + } else { + ++it; + } + } + + // For the PARENT type SANDBOX_PATH volume, it will exist for a while + // (depending on GC policy) after the container is destroyed. So to + // avoid leaking it to other containers in the case that its gid is + // allocated to another volume, we need to change its owner group back + // to the original one (i.e., the primary group of its owner). + foreach (const string& volume, sandboxPathVolumes) { + // Get the uid of the volume's owner. + struct stat s; + if (::stat(volume.c_str(), &s) < 0) { + LOG(WARNING) << "Failed to stat '" << volume << "': " + << os::strerror(errno); + + continue; + } + + Result<string> user = os::user(s.st_uid); + if (!user.isSome()) { + LOG(WARNING) << "Failed to get username for the uid " << s.st_uid + << ": " << (user.isError() ? user.error() : "not found"); + + continue; + } + + // Get the primary group ID of the user. + Result<gid_t> gid = os::getgid(user.get()); + if (!gid.isSome()) { + LOG(WARNING) << "Failed to get gid for the user '" << user.get() + << "': " << (gid.isError() ? gid.error() : "not found"); + + continue; + } + + Try<Nothing> result = setVolumeOwnership(volume, gid.get(), false); + if (result.isError()) { + LOG(WARNING) << "Failed to set the owner group of the volume path '" + << volume << "' back to " << gid.get() << ": " + << result.error(); + } + } + + return Nothing(); + } + +private: + const IntervalSet<gid_t> totalGids; + IntervalSet<gid_t> freeGids; + + // Allocated gid infos keyed by the volume path. + hashmap<string, VolumeGidInfo> infos; +}; + + +Try<VolumeGidManager*> VolumeGidManager::create(const Flags& flags) +{ + if (geteuid() != 0) { + return Error("Volume gid manager requires root privileges"); + } + + CHECK_SOME(flags.volume_gid_range); + + Try<Resource> parse = + Resources::parse("gids", flags.volume_gid_range.get(), "*"); + + if (parse.isError()) { + return Error( + "Failed to parse volume gid range '" + + flags.volume_gid_range.get() + "'"); + } + + if (parse->type() != Value::RANGES) { + return Error( + "Invalid volume gid range type " + + mesos::Value_Type_Name(parse->type()) + + ", expecting " + + mesos::Value_Type_Name(Value::RANGES)); + } + + Try<IntervalSet<gid_t>> gids = + rangesToIntervalSet<gid_t>(parse->ranges()); + + if (gids.isError()) { + return Error("Invalid volume gid range '" + + stringify(parse->ranges()) + "': " + gids.error()); + } else if (gids->empty()) { + return Error("Empty volume gid range"); + } + + return new VolumeGidManager( + Owned<VolumeGidManagerProcess>(new VolumeGidManagerProcess(gids.get()))); +} + + +VolumeGidManager::VolumeGidManager( + const Owned<VolumeGidManagerProcess>& _process) + : process(_process) +{ + spawn(process.get()); +} + + +VolumeGidManager::~VolumeGidManager() +{ + terminate(process.get()); + process::wait(process.get()); +} + + +Future<gid_t> VolumeGidManager::allocate( + const string& path, + VolumeGidInfo::Type type) const +{ + return dispatch(process.get(), + &VolumeGidManagerProcess::allocate, + path, + type); +} + + +Future<Nothing> VolumeGidManager::deallocate(const string& path) const +{ + return dispatch(process.get(), &VolumeGidManagerProcess::deallocate, path); +} + +} // namespace slave { +} // namespace internal { +} // namespace mesos { diff --git a/src/slave/volume_gid_manager/volume_gid_manager.hpp b/src/slave/volume_gid_manager/volume_gid_manager.hpp new file mode 100644 index 0000000..51732af --- /dev/null +++ b/src/slave/volume_gid_manager/volume_gid_manager.hpp @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef __VOLUME_GID_MANAGER_HPP__ +#define __VOLUME_GID_MANAGER_HPP__ + +#include <process/future.hpp> +#include <process/process.hpp> + +#include <stout/nothing.hpp> +#include <stout/try.hpp> + +#include "slave/flags.hpp" + +#include "slave/volume_gid_manager/state.hpp" + +namespace mesos { +namespace internal { +namespace slave { + +// Forward declaration. +class VolumeGidManagerProcess; + + +// Manages the allocation of owner group IDs for shared +// persistent volumes and SANDBOX_PATH volume of PARENT type. +class VolumeGidManager +{ +public: + static Try<VolumeGidManager*> create(const Flags& flags); + + ~VolumeGidManager(); + + process::Future<gid_t> allocate( + const std::string& path, + VolumeGidInfo::Type type) const; + + process::Future<Nothing> deallocate(const std::string& path) const; + +private: + explicit VolumeGidManager( + const process::Owned<VolumeGidManagerProcess>& process); + + process::Owned<VolumeGidManagerProcess> process; +}; + +} // namespace slave { +} // namespace internal { +} // namespace mesos { + +#endif // __VOLUME_GID_MANAGER_HPP__
