Added `linux/devices` isolator whitelist support. Added support to the `linux/devices` isolator for populating devices into containers. This introduces a general mechanism for populating devices into a specific container, but it currently only populates the same set of devices into all containers, based on the devices specified by the `--allowed_devices` agent flag.
Review: https://reviews.apache.org/r/67097/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/ae413c9f Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/ae413c9f Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/ae413c9f Branch: refs/heads/master Commit: ae413c9f0496b24845bd868f740dfbb63ee6db4b Parents: 377478d Author: James Peach <[email protected]> Authored: Fri May 25 13:38:06 2018 -0700 Committer: James Peach <[email protected]> Committed: Fri May 25 13:38:06 2018 -0700 ---------------------------------------------------------------------- .../mesos/isolators/linux/devices.cpp | 167 ++++++++++++++++++- .../mesos/isolators/linux/devices.hpp | 23 ++- 2 files changed, 182 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/ae413c9f/src/slave/containerizer/mesos/isolators/linux/devices.cpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/mesos/isolators/linux/devices.cpp b/src/slave/containerizer/mesos/isolators/linux/devices.cpp index 35ed008..cc2fd90 100644 --- a/src/slave/containerizer/mesos/isolators/linux/devices.cpp +++ b/src/slave/containerizer/mesos/isolators/linux/devices.cpp @@ -14,31 +14,116 @@ // See the License for the specific language governing permissions and // limitations under the License. 
- #include "slave/containerizer/mesos/isolators/linux/devices.hpp" +#include <sys/mount.h> + +#include <process/id.hpp> + +#include <stout/os.hpp> +#include <stout/path.hpp> +#include <stout/strings.hpp> + +#include <stout/os/posix/chown.hpp> + +#include "slave/containerizer/mesos/paths.hpp" + using process::Failure; using process::Future; using process::Owned; using mesos::slave::ContainerConfig; using mesos::slave::ContainerLaunchInfo; +using mesos::slave::ContainerMountInfo; using mesos::slave::Isolator; +using std::string; + namespace mesos { namespace internal { namespace slave { Try<Isolator*> LinuxDevicesIsolatorProcess::create(const Flags& flags) { - if (geteuid() != 0) { + if (::geteuid() != 0) { return Error("Linux devices isolator requires root permissions"); } - return new MesosIsolator( - Owned<MesosIsolatorProcess>(new LinuxDevicesIsolatorProcess(flags))); + if (flags.launcher != "linux") { + return Error("'linux' launcher must be used"); + } + + if (!strings::contains(flags.isolation, "filesystem/linux")) { + return Error("'filesystem/linux' isolator must be used"); + } + + hashmap<string, Device> whitelistedDevices; + + if (flags.allowed_devices.isSome()) { + foreach (const DeviceAccess& deviceAccess, + flags.allowed_devices->allowed_devices()) { + // TODO(jpeach) The `cgroups/devices` isolator silently ignores devices + // that are whitelisted with no path. We should do the same if we want + // to be consistent, but silently ignoring configuration is not the + // right policy, so perhaps we should fix `cgroups/devices` instead. 
+ if (!deviceAccess.device().has_path()) { + return Error("Whitelisted device has no device path provided"); + } + + const string& path = deviceAccess.device().path(); + + Try<dev_t> rdev = os::stat::rdev(path); + if (rdev.isError()) { + return Error("Failed to obtain device ID for '" + path + + "': " + rdev.error()); + } + + Try<mode_t> mode = os::stat::mode(path); + if (mode.isError()) { + return Error("Failed to obtain device mode for '" + path + + "': " + mode.error()); + } + + Device dev = {rdev.get(), S_IRUSR | S_IWUSR }; + + if (S_ISBLK(mode.get())) { + dev.mode |= S_IFBLK; + } else if (S_ISCHR(mode.get())) { + dev.mode |= S_IFCHR; + } else { + return Error("'" + path + "' is not a block or character device"); + } + + // Set the desired access for the device. Access is controlled at + // container granularity, which is consistent with the devices cgroup + // policy. This means that if we populate a read-write device into a + // container, then every process in that container should have access, + // regardless of the credential of that process. 
+ + if (deviceAccess.access().read()) { + dev.mode |= (S_IRGRP | S_IROTH); + } + + if (deviceAccess.access().write()) { + dev.mode |= (S_IWGRP | S_IWOTH); + } + + whitelistedDevices.put( + strings::remove(path, "/dev/", strings::PREFIX), dev); + } + } + + return new MesosIsolator(Owned<MesosIsolatorProcess>( + new LinuxDevicesIsolatorProcess(flags.runtime_dir, whitelistedDevices))); } +LinuxDevicesIsolatorProcess::LinuxDevicesIsolatorProcess( + const string& _runtimeDirectory, + const hashmap<string, Device>& _whitelistedDevices) + : ProcessBase(process::ID::generate("linux-devices-isolator")), + runtimeDirectory(_runtimeDirectory), + whitelistedDevices(_whitelistedDevices) {} + bool LinuxDevicesIsolatorProcess::supportsNesting() { @@ -56,7 +141,81 @@ Future<Option<ContainerLaunchInfo>> LinuxDevicesIsolatorProcess::prepare( const ContainerID& containerId, const ContainerConfig& containerConfig) { + // If there's no rootfs, we won't be building a private `/dev` + // so there's nothing to do. + if (!containerConfig.has_rootfs()) { + return None(); + } + + if (whitelistedDevices.empty()) { return None(); + } + + ContainerLaunchInfo launchInfo; + + const string devicesDir = containerizer::paths::getContainerDevicesPath( + runtimeDirectory, containerId); + + Try<Nothing> mkdir = os::mkdir(devicesDir); + if (mkdir.isError()) { + return Failure( + "Failed to create container devices directory: " + mkdir.error()); + } + + Try<Nothing> chmod = os::chmod(devicesDir, 0700); + if (chmod.isError()) { + return Failure("Failed to set container devices directory permissions: " + + chmod.error()); + } + + // We need to restrict access to the devices directory so that all + // processes on the system don't get access to devices that we make + // read-write. This means that we have to chown to ensure that the + // container user still has access. 
+ if (containerConfig.has_user()) { + Try<Nothing> chown = os::chown(containerConfig.user(), devicesDir); + if (chown.isError()) { + return Failure( + "Failed to set '" + containerConfig.user() + "' " + "as the container devices directory owner: " + chown.error()); + } + } + + // Import the whitelisted devices to all containers. + foreachpair (const string& path, const Device& dev, whitelistedDevices) { + const string devicePath = path::join(devicesDir, path); + + Try<Nothing> mkdir = os::mkdir(Path(devicePath).dirname()); + if (mkdir.isError()) { + return Failure( + "Failed to create parent directory for device '" + + devicePath + "': " + mkdir.error()); + } + + Try<Nothing> mknod = os::mknod(devicePath, dev.mode, dev.dev); + if (mknod.isError()) { + return Failure( + "Failed to create device '" + devicePath + "': " + mknod.error()); + } + + // We have to chmod the device to make sure that the umask doesn't filter + // the permissions defined by the whitelist. + Try<Nothing> chmod = os::chmod(devicePath, dev.mode & ~S_IFMT); + if (chmod.isError()) { + return Failure( + "Failed to chmod device '" + devicePath + "': " + chmod.error()); + } + + ContainerMountInfo* mount = launchInfo.add_mounts(); + mount->set_source(devicePath); + mount->set_target(path::join(containerConfig.rootfs(), "dev", path)); + mount->set_flags(MS_BIND); + } + + // TODO(jpeach) Define Task API to let schedulers specify the container + // devices and automatically populate the right devices cgroup entries. 
+ + return launchInfo; } } // namespace slave { http://git-wip-us.apache.org/repos/asf/mesos/blob/ae413c9f/src/slave/containerizer/mesos/isolators/linux/devices.hpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/mesos/isolators/linux/devices.hpp b/src/slave/containerizer/mesos/isolators/linux/devices.hpp index e731ef3..58128f0 100644 --- a/src/slave/containerizer/mesos/isolators/linux/devices.hpp +++ b/src/slave/containerizer/mesos/isolators/linux/devices.hpp @@ -17,6 +17,11 @@ #ifndef __LINUX_DEVICES_ISOLATOR_HPP__ #define __LINUX_DEVICES_ISOLATOR_HPP__ +#include <sys/types.h> + +#include <string> + +#include <stout/hashmap.hpp> #include <stout/try.hpp> #include "slave/flags.hpp" @@ -32,15 +37,25 @@ class LinuxDevicesIsolatorProcess : public MesosIsolatorProcess public: static Try<mesos::slave::Isolator*> create(const Flags& flags); - virtual bool supportsNesting(); - virtual bool supportsStandalone(); + virtual bool supportsNesting() override; + virtual bool supportsStandalone() override; virtual process::Future<Option<mesos::slave::ContainerLaunchInfo>> prepare( const ContainerID& containerId, - const mesos::slave::ContainerConfig& containerConfig); + const mesos::slave::ContainerConfig& containerConfig) override; private: - LinuxDevicesIsolatorProcess(const Flags& _flags); + struct Device { + dev_t dev; + mode_t mode; + }; + + const std::string runtimeDirectory; + const hashmap<std::string, Device> whitelistedDevices; + + LinuxDevicesIsolatorProcess( + const std::string& runtimeDirectory, + const hashmap<std::string, Device>& whitelistedDevices); }; } // namespace slave {
