This is an automated email from the ASF dual-hosted git repository. gilbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 3b7670ee72ae215faf2ad012e92fc344fc540baa Author: Qian Zhang <[email protected]> AuthorDate: Wed Feb 27 22:22:21 2019 -0800 Made non-root containers can access shared persistent volume. Review: https://reviews.apache.org/r/69544/ --- src/slave/containerizer/mesos/containerizer.cpp | 80 +++++++---- .../mesos/isolators/filesystem/linux.cpp | 146 +++++++++++++------ .../mesos/isolators/filesystem/linux.hpp | 12 +- .../mesos/isolators/filesystem/posix.cpp | 157 +++++++++++++++------ .../mesos/isolators/filesystem/posix.hpp | 13 +- .../mesos/isolators/filesystem/windows.cpp | 11 +- .../mesos/isolators/filesystem/windows.hpp | 11 +- .../volume_gid_manager/volume_gid_manager.cpp | 79 ++++++++--- 8 files changed, 367 insertions(+), 142 deletions(-) diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp index a30c00a..0432448 100644 --- a/src/slave/containerizer/mesos/containerizer.cpp +++ b/src/slave/containerizer/mesos/containerizer.cpp @@ -371,13 +371,29 @@ Try<MesosContainerizer*> MesosContainerizer::create( // Filesystem isolators. #ifdef __WINDOWS__ - {"filesystem/windows", &WindowsFilesystemIsolatorProcess::create}, + {"filesystem/windows", + [volumeGidManager] (const Flags& flags) -> Try<Isolator*> { + return WindowsFilesystemIsolatorProcess::create( + flags, + volumeGidManager); + }}, #else - {"filesystem/posix", &PosixFilesystemIsolatorProcess::create}, + {"filesystem/posix", + [volumeGidManager] (const Flags& flags) -> Try<Isolator*> { + return PosixFilesystemIsolatorProcess::create( + flags, + volumeGidManager); + }}, #endif // __WINDOWS__ #ifdef __linux__ - {"filesystem/linux", &LinuxFilesystemIsolatorProcess::create}, + {"filesystem/linux", + [volumeGidManager] (const Flags& flags) -> Try<Isolator*> { + return LinuxFilesystemIsolatorProcess::create( + flags, + volumeGidManager); + }}, + // TODO(jieyu): Deprecate this in favor of using filesystem/linux. {"filesystem/shared", &SharedFilesystemIsolatorProcess::create}, #endif // __linux__ @@ -1715,35 +1731,49 @@ Future<Containerizer::LaunchResult> MesosContainerizerProcess::_launch( } // For command tasks specifically, we should add the task_environment - // flag to the launch command of the command executor. + // flag and the task_supplementary_groups flag to the launch command + // of the command executor. // TODO(tillt): Remove this once we no longer support the old style // command task (i.e., that uses mesos-execute). - if (container->config->has_task_info() && launchInfo.has_task_environment()) { - hashmap<string, string> commandTaskEnvironment; + if (container->config->has_task_info()) { + if (launchInfo.has_task_environment()) { + hashmap<string, string> commandTaskEnvironment; + + foreach (const Environment::Variable& variable, + launchInfo.task_environment().variables()) { + const string& name = variable.name(); + const string& value = variable.value(); + if (commandTaskEnvironment.contains(name) && + commandTaskEnvironment[name] != value) { + LOG(WARNING) << "Overwriting environment variable '" << name << "' " + << "for container " << containerId; + } + // TODO(tillt): Consider making this 'secret' aware. + commandTaskEnvironment[name] = value; + } - foreach (const Environment::Variable& variable, - launchInfo.task_environment().variables()) { - const string& name = variable.name(); - const string& value = variable.value(); - if (commandTaskEnvironment.contains(name) && - commandTaskEnvironment[name] != value) { - LOG(WARNING) << "Overwriting environment variable '" << name << "' " - << "for container " << containerId; + if (!commandTaskEnvironment.empty()) { + Environment taskEnvironment; + foreachpair ( + const string& name, const string& value, commandTaskEnvironment) { + Environment::Variable* variable = taskEnvironment.add_variables(); + variable->set_name(name); + variable->set_value(value); + } + launchInfo.mutable_command()->add_arguments( + "--task_environment=" + stringify(JSON::protobuf(taskEnvironment))); } - // TODO(tillt): Consider making this 'secret' aware. - commandTaskEnvironment[name] = value; } - if (!commandTaskEnvironment.empty()) { - Environment taskEnvironment; - foreachpair ( - const string& name, const string& value, commandTaskEnvironment) { - Environment::Variable* variable = taskEnvironment.add_variables(); - variable->set_name(name); - variable->set_value(value); - } + if (launchInfo.task_supplementary_groups_size() > 0) { + // Remove duplicated entries in supplementary groups. + set<uint32_t> taskSupplementaryGroups( + launchInfo.task_supplementary_groups().begin(), + launchInfo.task_supplementary_groups().end()); + launchInfo.mutable_command()->add_arguments( - "--task_environment=" + stringify(JSON::protobuf(taskEnvironment))); + "--task_supplementary_groups=" + + strings::join(",", taskSupplementaryGroups)); } } diff --git a/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp b/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp index 1e116b7..341853a 100644 --- a/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp +++ b/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp @@ -37,9 +37,9 @@ #include <stout/stringify.hpp> #include <stout/strings.hpp> +#include <stout/os/realpath.hpp> #include <stout/os/shell.hpp> #include <stout/os/strerror.hpp> -#include <stout/os/realpath.hpp> #include "common/protobuf_utils.hpp" @@ -451,7 +451,9 @@ static Try<Nothing> ensureAllowDevices(const string& _targetDir) } -Try<Isolator*> LinuxFilesystemIsolatorProcess::create(const Flags& flags) +Try<Isolator*> LinuxFilesystemIsolatorProcess::create( + const Flags& flags, + VolumeGidManager* volumeGidManager) { if (geteuid() != 0) { return Error("'filesystem/linux' isolator requires root privileges"); @@ -506,16 +508,18 @@ Try<Isolator*> LinuxFilesystemIsolatorProcess::create(const Flags& flags) } Owned<MesosIsolatorProcess> process( - new LinuxFilesystemIsolatorProcess(flags)); + new LinuxFilesystemIsolatorProcess(flags, volumeGidManager)); return new MesosIsolator(process); } LinuxFilesystemIsolatorProcess::LinuxFilesystemIsolatorProcess( - const Flags& _flags) + const Flags& _flags, + VolumeGidManager* _volumeGidManager) : ProcessBase(process::ID::generate("linux-filesystem-isolator")), flags(_flags), + volumeGidManager(_volumeGidManager), metrics(PID<LinuxFilesystemIsolatorProcess>(this)) {} @@ -748,9 +752,29 @@ Future<Option<ContainerLaunchInfo>> LinuxFilesystemIsolatorProcess::prepare( } return update(containerId, containerConfig.resources()) - .then([launchInfo]() -> Future<Option<ContainerLaunchInfo>> { - return launchInfo; - }); + .then(defer( + self(), + [this, containerId, containerConfig, launchInfo]() mutable + -> Future<Option<ContainerLaunchInfo>> { + if (!infos.contains(containerId)) { + return Failure("Unknown container"); + } + + foreach (gid_t gid, infos[containerId]->gids) { + // For command task with its own rootfs, the command executor will + // run as root and the task itself will run as the specified normal + // user, so here we add the supplementary group for the task and the + // command executor will set it accordingly when launching the task. + if (containerConfig.has_task_info() && + containerConfig.has_rootfs()) { + launchInfo.add_task_supplementary_groups(gid); + } else { + launchInfo.add_supplementary_groups(gid); + } + } + + return launchInfo; + })); } @@ -826,6 +850,8 @@ Future<Nothing> LinuxFilesystemIsolatorProcess::update( const uid_t uid = s.st_uid; const gid_t gid = s.st_gid; + vector<Future<gid_t>> futures; + // We then mount new persistent volumes. foreach (const Resource& resource, resources.persistentVolumes()) { // This is enforced by the master. @@ -848,42 +874,68 @@ Future<Nothing> LinuxFilesystemIsolatorProcess::update( // Determine the source of the mount. string source = paths::getPersistentVolumePath(flags.work_dir, resource); - bool isVolumeInUse = false; - - foreachpair (const ContainerID& _containerId, - const Owned<Info>& info, - infos) { - // Skip self. - if (_containerId == containerId) { - continue; - } - - if (info->resources.contains(resource)) { - isVolumeInUse = true; - break; - } - } + // If the container's user is root (uid == 0), we do not need to do any + // changes about the volume's ownership since it has the full permissions + // to access the volume. + if (uid != 0) { + // For shared persistent volume, if volume gid manager is enabled, call + // volume gid manager to allocate a gid to make sure the container has + // the permission to access the volume. + // + // TODO(qianzhang): Support gid allocation for shared persistent volumes + // from resource providers. + if (resource.has_shared() && + !Resources::hasResourceProvider(resource) && + volumeGidManager) { + LOG(INFO) << "Invoking volume gid manager to allocate gid to the " + << "volume path '" << source << "' for container " + << containerId; + + futures.push_back( + volumeGidManager->allocate(source, VolumeGidInfo::PERSISTENT)); + } else { + bool isVolumeInUse = false; + + // Check if the shared persistent volume is currently used by another + // container. We do not need to do this check for local persistent + // volume since it can only be used by one container at a time. + if (resource.has_shared()) { + foreachpair (const ContainerID& _containerId, + const Owned<Info>& info, + infos) { + // Skip self. + if (_containerId == containerId) { + continue; + } + + if (info->resources.contains(resource)) { + isVolumeInUse = true; + break; + } + } + } - // Set the ownership of the persistent volume to match that of the sandbox - // directory if the volume is not already in use. If the volume is - // currently in use by other containers, tasks in this container may fail - // to read from or write to the persistent volume due to incompatible - // ownership and file system permissions. - if (!isVolumeInUse) { - LOG(INFO) << "Changing the ownership of the persistent volume at '" - << source << "' with uid " << uid << " and gid " << gid; - - Try<Nothing> chown = os::chown(uid, gid, source, false); - - if (chown.isError()) { - return Failure( - "Failed to change the ownership of the persistent volume at '" + - source + "' with uid " + stringify(uid) + - " and gid " + stringify(gid) + ": " + chown.error()); + // Set the ownership of the persistent volume to match that of the + // sandbox directory if the volume is not already in use. If the + // volume is currently in use by other containers, tasks in this + // container may fail to read from or write to the persistent volume + // due to incompatible ownership and file system permissions. + if (!isVolumeInUse) { + LOG(INFO) << "Changing the ownership of the persistent volume at '" + << source << "' with uid " << uid << " and gid " << gid; + + Try<Nothing> chown = os::chown(uid, gid, source, false); + if (chown.isError()) { + return Failure( + "Failed to change the ownership of the persistent volume at '" + + source + "' with uid " + stringify(uid) + + " and gid " + stringify(gid) + ": " + chown.error()); + } + } else { + LOG(INFO) << "Leaving the ownership of the persistent volume at '" + << source << "' unchanged because it is in use"; + } } - } else { - LOG(INFO) << "Leaving the ownership of the persistent volume at '" - << source << "' unchanged because it is in use"; } // Determine the target of the mount. @@ -956,7 +1008,17 @@ Future<Nothing> LinuxFilesystemIsolatorProcess::update( // Store the new resources; info->resources = resources; - return Nothing(); + return collect(futures) + .then(defer(self(), [this, containerId](const vector<gid_t>& gids) + -> Future<Nothing> { + if (!infos.contains(containerId)) { + return Failure("Unknown container"); + } + + infos[containerId]->gids = gids; + + return Nothing(); + })); } diff --git a/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp b/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp index 8f76944..e6fe688 100644 --- a/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp +++ b/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp @@ -31,6 +31,8 @@ #include "slave/containerizer/mesos/isolator.hpp" +#include "slave/volume_gid_manager/volume_gid_manager.hpp" + namespace mesos { namespace internal { namespace slave { @@ -42,7 +44,9 @@ namespace slave { class LinuxFilesystemIsolatorProcess : public MesosIsolatorProcess { public: - static Try<mesos::slave::Isolator*> create(const Flags& flags); + static Try<mesos::slave::Isolator*> create( + const Flags& flags, + VolumeGidManager* volumeGidManager = nullptr); ~LinuxFilesystemIsolatorProcess() override; @@ -65,9 +69,12 @@ public: const ContainerID& containerId) override; private: - LinuxFilesystemIsolatorProcess(const Flags& flags); + LinuxFilesystemIsolatorProcess( + const Flags& flags, + VolumeGidManager* volumeGidManager); const Flags flags; + VolumeGidManager* volumeGidManager; struct Info { @@ -84,6 +91,7 @@ private: Resources resources; Option<ExecutorInfo> executor; + std::vector<gid_t> gids; }; hashmap<ContainerID, process::Owned<Info>> infos; diff --git a/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp b/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp index f91a2ee..08449e2 100644 --- a/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp +++ b/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp @@ -17,6 +17,7 @@ #include <string> #include <vector> +#include <process/collect.hpp> #include <process/id.hpp> #include <stout/fs.hpp> @@ -45,18 +46,22 @@ namespace internal { namespace slave { PosixFilesystemIsolatorProcess::PosixFilesystemIsolatorProcess( - const Flags& _flags) + const Flags& _flags, + VolumeGidManager* _volumeGidManager) : ProcessBase(process::ID::generate("posix-filesystem-isolator")), - flags(_flags) {} + flags(_flags), + volumeGidManager(_volumeGidManager) {} PosixFilesystemIsolatorProcess::~PosixFilesystemIsolatorProcess() {} -Try<Isolator*> PosixFilesystemIsolatorProcess::create(const Flags& flags) +Try<Isolator*> PosixFilesystemIsolatorProcess::create( + const Flags& flags, + VolumeGidManager* volumeGidManager) { process::Owned<MesosIsolatorProcess> process( - new PosixFilesystemIsolatorProcess(flags)); + new PosixFilesystemIsolatorProcess(flags, volumeGidManager)); return new MesosIsolator(process); } @@ -101,7 +106,30 @@ Future<Option<ContainerLaunchInfo>> PosixFilesystemIsolatorProcess::prepare( infos.put(containerId, Owned<Info>(new Info(containerConfig.directory()))); return update(containerId, executorInfo.resources()) - .then([]() -> Future<Option<ContainerLaunchInfo>> { return None(); }); + .then(defer( + self(), + [this, containerId, containerConfig]() mutable + -> Future<Option<ContainerLaunchInfo>> { + if (!infos.contains(containerId)) { + return Failure("Unknown container"); + } + + ContainerLaunchInfo launchInfo; + foreach (gid_t gid, infos[containerId]->gids) { + // For command task with its own rootfs, the command executor will + // run as root and the task itself will run as the specified normal + // user, so here we add the supplementary group for the task and the + // command executor will set it accordingly when launching the task. + if (containerConfig.has_task_info() && + containerConfig.has_rootfs()) { + launchInfo.add_task_supplementary_groups(gid); + } else { + launchInfo.add_supplementary_groups(gid); + } + } + + return launchInfo; + })); } @@ -162,6 +190,8 @@ Future<Nothing> PosixFilesystemIsolatorProcess::update( const uid_t uid = s.st_uid; const gid_t gid = s.st_gid; + vector<Future<gid_t>> futures; + // We then link additional persistent volumes. foreach (const Resource& resource, resources.persistentVolumes()) { // This is enforced by the master. @@ -183,45 +213,74 @@ Future<Nothing> PosixFilesystemIsolatorProcess::update( string original = paths::getPersistentVolumePath(flags.work_dir, resource); - bool isVolumeInUse = false; - - foreachpair (const ContainerID& _containerId, - const Owned<Info>& info, - infos) { - // Skip self. - if (_containerId == containerId) { - continue; - } - - if (info->resources.contains(resource)) { - isVolumeInUse = true; - break; - } - } - - // Set the ownership of the persistent volume to match that of the sandbox - // directory if the volume is not already in use. If the volume is - // currently in use by other containers, tasks in this container may fail - // to read from or write to the persistent volume due to incompatible - // ownership and file system permissions. - if (!isVolumeInUse) { - // TODO(hausdorff): (MESOS-5461) Persistent volumes maintain the invariant - // that they are used by one task at a time. This is currently enforced by - // `os::chown`. Windows does not support `os::chown`, we will need to - // revisit this later. -#ifndef __WINDOWS__ - LOG(INFO) << "Changing the ownership of the persistent volume at '" - << original << "' with uid " << uid << " and gid " << gid; - - Try<Nothing> chown = os::chown(uid, gid, original, false); - - if (chown.isError()) { - return Failure( - "Failed to change the ownership of the persistent volume at '" + - original + "' with uid " + stringify(uid) + - " and gid " + stringify(gid) + ": " + chown.error()); + // If the container's user is root (uid == 0), we do not need to do any + // changes about the volume's ownership since it has the full permissions + // to access the volume. + if (uid != 0) { + // For shared persistent volume, if volume gid manager is enabled, call + // volume gid manager to allocate a gid to make sure the container has + // the permission to access the volume. + // + // TODO(qianzhang): Support gid allocation for shared persistent volumes + // from resource providers. + if (resource.has_shared() && + !Resources::hasResourceProvider(resource) && + volumeGidManager) { + #ifndef __WINDOWS__ + LOG(INFO) << "Invoking volume gid manager to allocate gid to the " + << "volume path '" << original << "' for container " + << containerId; + + futures.push_back( + volumeGidManager->allocate(original, VolumeGidInfo::PERSISTENT)); + #endif // __WINDOWS__ + } else { + bool isVolumeInUse = false; + + // Check if the shared persistent volume is currently used by another + // container. We do not need to do this check for local persistent + // volume since it can only be used by one container at a time. + if (resource.has_shared()) { + foreachpair (const ContainerID& _containerId, + const Owned<Info>& info, + infos) { + // Skip self. + if (_containerId == containerId) { + continue; + } + + if (info->resources.contains(resource)) { + isVolumeInUse = true; + break; + } + } + } + + // Set the ownership of the persistent volume to match that of the + // sandbox directory if the volume is not already in use. If the + // volume is currently in use by other containers, tasks in this + // container may fail to read from or write to the persistent volume + // due to incompatible ownership and file system permissions. + if (!isVolumeInUse) { + // TODO(hausdorff): (MESOS-5461) Persistent volumes maintain the + // invariant that they are used by one task at a time. This is + // currently enforced by `os::chown`. Windows does not support + // `os::chown`, we will need to revisit this later. + #ifndef __WINDOWS__ + LOG(INFO) << "Changing the ownership of the persistent volume at '" + << original << "' with uid " << uid << " and gid " << gid; + + Try<Nothing> chown = os::chown(uid, gid, original, false); + + if (chown.isError()) { + return Failure( + "Failed to change the ownership of the persistent volume at '" + + original + "' with uid " + stringify(uid) + + " and gid " + stringify(gid) + ": " + chown.error()); + } + #endif // __WINDOWS__ + } } -#endif } string link = path::join(info->directory, containerPath); @@ -279,7 +338,17 @@ Future<Nothing> PosixFilesystemIsolatorProcess::update( // Store the updated resources. info->resources = resources; - return Nothing(); + return collect(futures) + .then(defer(self(), [this, containerId](const vector<gid_t>& gids) + -> Future<Nothing> { + if (!infos.contains(containerId)) { + return Failure("Unknown container"); + } + + infos[containerId]->gids = gids; + + return Nothing(); + })); } diff --git a/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp b/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp index deacc90..9db6ddd 100644 --- a/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp +++ b/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp @@ -23,6 +23,8 @@ #include "slave/containerizer/mesos/isolator.hpp" +#include "slave/volume_gid_manager/volume_gid_manager.hpp" + namespace mesos { namespace internal { namespace slave { @@ -30,7 +32,9 @@ namespace slave { class PosixFilesystemIsolatorProcess : public MesosIsolatorProcess { public: - static Try<mesos::slave::Isolator*> create(const Flags& flags); + static Try<mesos::slave::Isolator*> create( + const Flags& flags, + VolumeGidManager* volumeGidManager); ~PosixFilesystemIsolatorProcess() override; @@ -50,9 +54,12 @@ public: const ContainerID& containerId) override; protected: - PosixFilesystemIsolatorProcess(const Flags& flags); + PosixFilesystemIsolatorProcess( + const Flags& flags, + VolumeGidManager* volumeGidManager); const Flags flags; + VolumeGidManager* volumeGidManager; struct Info { @@ -63,6 +70,8 @@ protected: // Track resources so we can unlink unneeded persistent volumes. Resources resources; + + std::vector<gid_t> gids; }; hashmap<ContainerID, process::Owned<Info>> infos; diff --git a/src/slave/containerizer/mesos/isolators/filesystem/windows.cpp b/src/slave/containerizer/mesos/isolators/filesystem/windows.cpp index f169c38..eb8d528 100644 --- a/src/slave/containerizer/mesos/isolators/filesystem/windows.cpp +++ b/src/slave/containerizer/mesos/isolators/filesystem/windows.cpp @@ -29,14 +29,17 @@ namespace internal { namespace slave { WindowsFilesystemIsolatorProcess::WindowsFilesystemIsolatorProcess( - const Flags& _flags) + const Flags& _flags, + VolumeGidManager* _volumeGidManager) : ProcessBase(process::ID::generate("windows-filesystem-isolator")), - PosixFilesystemIsolatorProcess(_flags) {} + PosixFilesystemIsolatorProcess(_flags, _volumeGidManager) {} -Try<Isolator*> WindowsFilesystemIsolatorProcess::create(const Flags& flags) +Try<Isolator*> WindowsFilesystemIsolatorProcess::create( + const Flags& flags, + VolumeGidManager* volumeGidManager) { process::Owned<MesosIsolatorProcess> process( - new WindowsFilesystemIsolatorProcess(flags)); + new WindowsFilesystemIsolatorProcess(flags, volumeGidManager)); return new MesosIsolator(process); } diff --git a/src/slave/containerizer/mesos/isolators/filesystem/windows.hpp b/src/slave/containerizer/mesos/isolators/filesystem/windows.hpp index 2bf011d..aea0165 100644 --- a/src/slave/containerizer/mesos/isolators/filesystem/windows.hpp +++ b/src/slave/containerizer/mesos/isolators/filesystem/windows.hpp @@ -22,8 +22,11 @@ #include "slave/flags.hpp" #include "slave/containerizer/mesos/isolator.hpp" + #include "slave/containerizer/mesos/isolators/filesystem/posix.hpp" +#include "slave/volume_gid_manager/volume_gid_manager.hpp" + namespace mesos { namespace internal { namespace slave { @@ -36,10 +39,14 @@ namespace slave { class WindowsFilesystemIsolatorProcess : public PosixFilesystemIsolatorProcess { public: - static Try<mesos::slave::Isolator*> create(const Flags& flags); + static Try<mesos::slave::Isolator*> create( + const Flags& flags, + VolumeGidManager* volumeGidManager); private: - WindowsFilesystemIsolatorProcess(const Flags& flags); + WindowsFilesystemIsolatorProcess( + const Flags& flags, + VolumeGidManager* volumeGidManager); }; } // namespace slave { diff --git a/src/slave/volume_gid_manager/volume_gid_manager.cpp b/src/slave/volume_gid_manager/volume_gid_manager.cpp index ed8f6a2..49f70a2 100644 --- a/src/slave/volume_gid_manager/volume_gid_manager.cpp +++ b/src/slave/volume_gid_manager/volume_gid_manager.cpp @@ -28,6 +28,10 @@ #include "common/values.hpp" +#ifdef __linux__ +#include "linux/fs.hpp" +#endif // __linux__ + #include "slave/volume_gid_manager/volume_gid_manager.hpp" using std::string; @@ -168,34 +172,67 @@ public: LOG(INFO) << "Use the allocated gid " << gid << " of the volume path '" << path << "'"; } else { - // Allocate a free gid to the specified path and then set the - // ownership for it. - if (freeGids.empty()) { - return Failure( - "Failed to allocate gid to the volume path '" + path + - "' because the free gid range is exhausted"); + Option<string> realPath; +#ifdef __linux__ + // This is to handle the case that nested containers use shared persistent + // volume, in which case we did a workaround in the default executor to + // set up a volume mapping (i.e., map the shared persistent volume to a + // SANDBOX_PATH volume of PARENT type for nested containers) so that the + // nested container can access the shared persistent volume. So here we + // need to search the mount table to see if the `path` is the target of a + // mount, and check if we have already allocated a gid to the source of + // that mount, if yes, then we just return that gid instead of allocating + // a new one, otherwise the ownership of the shared persistent volume will + // be wrongly overwritten. + Try<fs::MountInfoTable> table = fs::MountInfoTable::read(); + if (table.isError()) { + return Failure("Failed to get mount table: " + table.error()); } - gid = freeGids.begin()->lower(); + foreach (const fs::MountInfoTable::Entry& entry, table->entries) { + if (path == entry.target) { + realPath = entry.root; + break; + } + } +#endif // __linux__ - LOG(INFO) << "Allocating gid " << gid << " to the volume path '" - << path << "'"; + if (realPath.isSome() && infos.contains(realPath.get())) { + gid = infos[realPath.get()].gid(); - Try<Nothing> result = setVolumeOwnership(path, gid, true); - if (result.isError()) { - return Failure( - "Failed to set the owner group of the volume path '" + path + - "' to " + stringify(gid) + ": " + result.error()); - } + LOG(INFO) << "Use the allocated gid " << gid << " of the volume path '" + << realPath.get() << "' which is mounted at the volume path '" + << path << "'"; + } else { + // Allocate a free gid to the specified path and then set the + // ownership for it. + if (freeGids.empty()) { + return Failure( + "Failed to allocate gid to the volume path '" + path + + "' because the free gid range is exhausted"); + } + + gid = freeGids.begin()->lower(); + + LOG(INFO) << "Allocating gid " << gid << " to the volume path '" + << path << "'"; - freeGids -= gid; + Try<Nothing> result = setVolumeOwnership(path, gid, true); + if (result.isError()) { + return Failure( + "Failed to set the owner group of the volume path '" + path + + "' to " + stringify(gid) + ": " + result.error()); + } + + freeGids -= gid; - VolumeGidInfo info; - info.set_type(type); - info.set_path(path); - info.set_gid(gid); + VolumeGidInfo info; + info.set_type(type); + info.set_path(path); + info.set_gid(gid); - infos.put(path, info); + infos.put(path, info); + } } return gid;
