This is an automated email from the ASF dual-hosted git repository. jpeach pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit f7efc95c04f9a36372bf7f6028be5be50625204f Author: James Peach <[email protected]> AuthorDate: Thu May 23 19:09:09 2019 -0700 Add containerizer support for masking paths. Add support to the `filesystem/linux` isolator for masking container paths. Add a set of standard default paths to be masked, as derived from commonly used container runtimes. These paths either expose information about other system processes, or capabilities that should not be exposed to untrusted containers. We don't mask if the container is privileged, which is defined as sharing the host's PID namespace. For nested containers, we verify that the PID namespace is shared from the host all the way up the tree. Review: https://reviews.apache.org/r/70678/ --- include/mesos/slave/containerizer.proto | 5 ++ .../mesos/isolators/filesystem/linux.cpp | 67 ++++++++++++++++++++++ src/slave/containerizer/mesos/launch.cpp | 28 +++++++++ .../linux_filesystem_isolator_tests.cpp | 56 ++++++++++++++++++ src/tests/containerizer/rootfs.cpp | 1 + 5 files changed, 157 insertions(+) diff --git a/include/mesos/slave/containerizer.proto b/include/mesos/slave/containerizer.proto index 48ffa2e..e992448 100644 --- a/include/mesos/slave/containerizer.proto +++ b/include/mesos/slave/containerizer.proto @@ -243,6 +243,11 @@ message ContainerLaunchInfo { // this list will be mounted in order. repeated ContainerMountInfo mounts = 17; + // The set of paths to mask in the container. Files are masked by mounting + // `/dev/null` and directories are masked by mounting a zero-sized `tmpfs`. + // Paths are masked after all other mounts are made. + repeated string masked_paths = 21; + // (Linux only) The Seccomp profile for the container. // The profile is used to configure syscall filtering via `libseccomp`. 
optional seccomp.ContainerSeccompProfile seccomp_profile = 18; diff --git a/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp b/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp index 725754f..190054c 100644 --- a/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp +++ b/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp @@ -154,6 +154,21 @@ static const ContainerMountInfo ROOTFS_CONTAINER_MOUNTS[] = { }; +static const vector<string> ROOTFS_MASKED_PATHS = { + "/proc/acpi", + "/proc/asound", + "/proc/kcore", + "/proc/keys", + "/proc/key-users", + "/proc/latency_stats", + "/proc/sched_debug", + "/proc/scsi", + "/proc/timer_list", + "/proc/timer_stats", + "/sys/firmware", +}; + + static Try<Nothing> makeStandardDevices( const string& devicesDir, const string& rootDir, @@ -451,6 +466,44 @@ static Try<Nothing> ensureAllowDevices(const string& _targetDir) } +// We define a container is privileged if it is sharing the PID +// namespace with the host. For nested containers, we walk up +// the tree and verify it is shared all the way up to the root. +static Try<bool> isPrivilegedContainer( + const string runtimeDir, + const ContainerID& containerId, + const ContainerConfig& containerConfig) +{ + if (!containerConfig.container_info().linux_info().share_pid_namespace()) { + return false; + } + + CHECK(containerConfig.container_info().linux_info().share_pid_namespace()); + + // If we are a root container, we are privileged because we share + // the host's PID namespace. + if (!containerId.has_parent()) { + return true; + } + + // If we are a nested container, we have to walk up the container tree. 
+ ContainerID parentId = containerId.parent(); + Result<ContainerConfig> parentConfig = + containerizer::paths::getContainerConfig(runtimeDir, parentId); + + if (parentConfig.isNone()) { + return Error( + "Failed to find config for parent container " + stringify(parentId)); + } + + if (parentConfig.isError()) { + return Error(parentConfig.error()); + } + + return isPrivilegedContainer(runtimeDir, parentId, parentConfig.get()); +} + + Try<Isolator*> LinuxFilesystemIsolatorProcess::create( const Flags& flags, VolumeGidManager* volumeGidManager) @@ -744,6 +797,20 @@ Future<Option<ContainerLaunchInfo>> LinuxFilesystemIsolatorProcess::prepare( *launchInfo.add_mounts() = createContainerMount( containerConfig.directory(), sandbox, MS_BIND | MS_REC); + + Try<bool> privileged = + isPrivilegedContainer(flags.runtime_dir, containerId, containerConfig); + if (privileged.isError()) { + return Failure(privileged.error()); + } + + // Apply container path masking for non-privileged containers. + if (!privileged.get()) { + foreach (const string& path, ROOTFS_MASKED_PATHS) { + launchInfo.add_masked_paths( + path::join(containerConfig.rootfs(), path)); + } + } } // Currently, we only need to update resources for top level containers. 
diff --git a/src/slave/containerizer/mesos/launch.cpp b/src/slave/containerizer/mesos/launch.cpp index 88b97a5..5ddb4c7 100644 --- a/src/slave/containerizer/mesos/launch.cpp +++ b/src/slave/containerizer/mesos/launch.cpp @@ -452,6 +452,26 @@ static Try<Nothing> prepareMounts(const ContainerLaunchInfo& launchInfo) } +static Try<Nothing> maskPath(const string& target) +{ + Try<Nothing> mnt = Nothing(); + +#ifdef __linux__ + if (os::stat::isfile(target)) { + mnt = fs::mount("/dev/null", target, None(), MS_BIND | MS_RDONLY, None()); + } else if (os::stat::isdir(target)) { + mnt = fs::mount(None(), target, "tmpfs", MS_RDONLY, "size=0"); + } +#endif // __linux__ + + if (mnt.isError()) { + return Error("Failed to mask '" + target + "': " + mnt.error()); + } + + return Nothing(); +} + + static Try<Nothing> installResourceLimits(const RLimitInfo& limits) { #ifdef __WINDOWS__ @@ -744,6 +764,14 @@ int MesosContainerizerLaunch::execute() exitWithStatus(EXIT_FAILURE); } + foreach (const string& target, launchInfo.masked_paths()) { + mount = maskPath(target); + if (mount.isError()) { + cerr << "Failed to mask container paths: " << mount.error() << endl; + exitWithStatus(EXIT_FAILURE); + } + } + // Run additional preparation commands. These are run as the same // user and with the environment as the agent. foreach (const CommandInfo& command, launchInfo.pre_exec_commands()) { diff --git a/src/tests/containerizer/linux_filesystem_isolator_tests.cpp b/src/tests/containerizer/linux_filesystem_isolator_tests.cpp index 60e9ae5..2390902 100644 --- a/src/tests/containerizer/linux_filesystem_isolator_tests.cpp +++ b/src/tests/containerizer/linux_filesystem_isolator_tests.cpp @@ -195,6 +195,62 @@ TEST_F(LinuxFilesystemIsolatorTest, ROOT_PseudoDevicesWithRootFilesystem) } +// This test verifies that paths can be masked in the container's +// root filesystem. 
+TEST_F(LinuxFilesystemIsolatorTest, ROOT_MaskedPathsWithRootFilesystem) +{ + AWAIT_READY(DockerArchive::create(GetRegistryPath(), "test_image")); + + slave::Flags flags = CreateSlaveFlags(); + + Fetcher fetcher(flags); + + Try<MesosContainerizer*> create = + MesosContainerizer::create(flags, true, &fetcher); + + ASSERT_SOME(create); + + Owned<Containerizer> containerizer(create.get()); + + ContainerID containerId; + containerId.set_value(id::UUID::random().toString()); + + ExecutorInfo executor = createExecutorInfo( + "test_executor", + "set -x;" + // /proc/keys should be a char special because we masked it. + "test -c /proc/keys || exit 1;" + "test -s /proc/keys && exit 1;" + // /proc/scsi/scsi should not exist since we masked /proc/scsi. + "test -d /proc/scsi/scsi && exit 1;" + // Verify masked paths are read-only. + "mkdir /proc/scsi/foo && exit 1;" + "dd if=/dev/zero of=/proc/keys count=1;" + "test -c /proc/keys || exit 1;" + "exit 0"); + + executor.mutable_container()->CopyFrom(createContainerInfo("test_image")); + + string directory = path::join(flags.work_dir, "sandbox"); + ASSERT_SOME(os::mkdir(directory)); + + Future<Containerizer::LaunchResult> launch = containerizer->launch( + containerId, + createContainerConfig(None(), executor, directory), + map<string, string>(), + None()); + + AWAIT_ASSERT_EQ(Containerizer::LaunchResult::SUCCESS, launch); + + Future<Option<ContainerTermination>> wait = containerizer->wait(containerId); + + AWAIT_READY(wait); + ASSERT_SOME(wait.get()); + ASSERT_TRUE(wait->get().has_status()); + EXPECT_WEXITSTATUS_EQ(0, wait->get().status()); +} + + // This test verifies that the metrics about the number of executors // that have root filesystem specified is correctly reported. 
TEST_F(LinuxFilesystemIsolatorTest, ROOT_Metrics) diff --git a/src/tests/containerizer/rootfs.cpp b/src/tests/containerizer/rootfs.cpp index 48eb010..206cab6 100644 --- a/src/tests/containerizer/rootfs.cpp +++ b/src/tests/containerizer/rootfs.cpp @@ -131,6 +131,7 @@ Try<process::Owned<Rootfs>> LinuxRootfs::create(const string& root) "/bin/dd", "/bin/echo", "/bin/ls", + "/bin/mkdir", "/bin/ping", "/bin/sh", "/bin/sleep",
