Added `linux/devices` isolator whitelist support.

Added `linux/devices` isolator support for populating container
devices. This introduces a general mechanism for populating devices
into a specific container, but currently it only populates every
container with the devices specified by the `--allowed_devices`
agent flag.

Review: https://reviews.apache.org/r/67097/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/ae413c9f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/ae413c9f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/ae413c9f

Branch: refs/heads/master
Commit: ae413c9f0496b24845bd868f740dfbb63ee6db4b
Parents: 377478d
Author: James Peach <[email protected]>
Authored: Fri May 25 13:38:06 2018 -0700
Committer: James Peach <[email protected]>
Committed: Fri May 25 13:38:06 2018 -0700

----------------------------------------------------------------------
 .../mesos/isolators/linux/devices.cpp           | 167 ++++++++++++++++++-
 .../mesos/isolators/linux/devices.hpp           |  23 ++-
 2 files changed, 182 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/ae413c9f/src/slave/containerizer/mesos/isolators/linux/devices.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/linux/devices.cpp b/src/slave/containerizer/mesos/isolators/linux/devices.cpp
index 35ed008..cc2fd90 100644
--- a/src/slave/containerizer/mesos/isolators/linux/devices.cpp
+++ b/src/slave/containerizer/mesos/isolators/linux/devices.cpp
@@ -14,31 +14,116 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-
 #include "slave/containerizer/mesos/isolators/linux/devices.hpp"
 
+#include <sys/mount.h>
+
+#include <process/id.hpp>
+
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+#include <stout/strings.hpp>
+
+#include <stout/os/posix/chown.hpp>
+
+#include "slave/containerizer/mesos/paths.hpp"
+
 using process::Failure;
 using process::Future;
 using process::Owned;
 
 using mesos::slave::ContainerConfig;
 using mesos::slave::ContainerLaunchInfo;
+using mesos::slave::ContainerMountInfo;
 using mesos::slave::Isolator;
 
+using std::string;
+
 namespace mesos {
 namespace internal {
 namespace slave {
 
 Try<Isolator*> LinuxDevicesIsolatorProcess::create(const Flags& flags)
 {
-  if (geteuid() != 0) {
+  if (::geteuid() != 0) {
     return Error("Linux devices isolator requires root permissions");
   }
 
-  return new MesosIsolator(
-      Owned<MesosIsolatorProcess>(new LinuxDevicesIsolatorProcess(flags)));
+  if (flags.launcher != "linux") {
+    return Error("'linux' launcher must be used");
+  }
+
+  if (!strings::contains(flags.isolation, "filesystem/linux")) {
+    return Error("'filesystem/linux' isolator must be used");
+  }
+
+  hashmap<string, Device> whitelistedDevices;
+
+  if (flags.allowed_devices.isSome()) {
+    foreach (const DeviceAccess& deviceAccess,
+             flags.allowed_devices->allowed_devices()) {
+      // TODO(jpeach) The `cgroups/devices` isolator silently ignores devices
+      // that are whitelisted with no path. We should do the same if we want
+      // to be consistent, but silently ignoring configuration is not the
+      // right policy, so perhaps we should fix `cgroups/devices` instead.
+      if (!deviceAccess.device().has_path()) {
+        return Error("Whitelisted device has no device path provided");
+      }
+
+      const string& path = deviceAccess.device().path();
+
+      Try<dev_t> rdev = os::stat::rdev(path);
+      if (rdev.isError()) {
+        return Error("Failed to obtain device ID for '" + path +
+                     "': " + rdev.error());
+      }
+
+      Try<mode_t> mode = os::stat::mode(path);
+      if (mode.isError()) {
+        return Error("Failed to obtain device mode for '" + path +
+                     "': " + mode.error());
+      }
+
+      Device dev = {rdev.get(), S_IRUSR | S_IWUSR };
+
+      if (S_ISBLK(mode.get())) {
+        dev.mode |= S_IFBLK;
+      } else if (S_ISCHR(mode.get())) {
+        dev.mode |= S_IFCHR;
+      } else {
+        return Error("'" + path + "' is not a block or character device");
+      }
+
+      // Set the desired access for the device. Access is controlled at
+      // container granularity, which is consistent with the devices cgroup
+      // policy. This means that if we populate a read-write device into a
+      // container, then every process in that container should have access,
+      // regardless of the credential of that process.
+
+      if (deviceAccess.access().read()) {
+        dev.mode |= (S_IRGRP | S_IROTH);
+      }
+
+      if (deviceAccess.access().write()) {
+        dev.mode |= (S_IWGRP | S_IWOTH);
+      }
+
+      whitelistedDevices.put(
+          strings::remove(path, "/dev/", strings::PREFIX), dev);
+    }
+  }
+
+  return new MesosIsolator(Owned<MesosIsolatorProcess>(
+      new LinuxDevicesIsolatorProcess(flags.runtime_dir, whitelistedDevices)));
 }
 
+LinuxDevicesIsolatorProcess::LinuxDevicesIsolatorProcess(
+    const string& _runtimeDirectory,
+    const hashmap<string, Device>& _whitelistedDevices)
+  : ProcessBase(process::ID::generate("linux-devices-isolator")),
+    runtimeDirectory(_runtimeDirectory),
+    whitelistedDevices(_whitelistedDevices) {}
+
 
 bool LinuxDevicesIsolatorProcess::supportsNesting()
 {
@@ -56,7 +141,81 @@ Future<Option<ContainerLaunchInfo>> LinuxDevicesIsolatorProcess::prepare(
     const ContainerID& containerId,
     const ContainerConfig& containerConfig)
 {
+  // If there's no rootfs, we won't be building a private `/dev`
+  // so there's nothing to do.
+  if (!containerConfig.has_rootfs()) {
+    return None();
+  }
+
+  if (whitelistedDevices.empty()) {
     return None();
+  }
+
+  ContainerLaunchInfo launchInfo;
+
+  const string devicesDir = containerizer::paths::getContainerDevicesPath(
+      runtimeDirectory, containerId);
+
+  Try<Nothing> mkdir = os::mkdir(devicesDir);
+  if (mkdir.isError()) {
+    return Failure(
+        "Failed to create container devices directory: " + mkdir.error());
+  }
+
+  Try<Nothing> chmod = os::chmod(devicesDir, 0700);
+  if (chmod.isError()) {
+    return Failure("Failed to set container devices directory permissions: " +
+                   chmod.error());
+  }
+
+  // We need to restrict access to the devices directory so that all
+  // processes on the system don't get access to devices that we make
+  // read-write. This means that we have to chown to ensure that the
+  // container user still has access.
+  if (containerConfig.has_user()) {
+    Try<Nothing> chown = os::chown(containerConfig.user(), devicesDir);
+    if (chown.isError()) {
+      return Failure(
+          "Failed to set '" + containerConfig.user() + "' "
+          "as the container devices directory owner: " + chown.error());
+    }
+  }
+
+  // Import the whitelisted devices to all containers.
+  foreachpair (const string& path, const Device& dev, whitelistedDevices) {
+    const string devicePath = path::join(devicesDir, path);
+
+    Try<Nothing> mkdir = os::mkdir(Path(devicePath).dirname());
+    if (mkdir.isError()) {
+      return Failure(
+          "Failed to create parent directory for device '" +
+          devicePath + "': " + mkdir.error());
+    }
+
+    Try<Nothing> mknod = os::mknod(devicePath, dev.mode, dev.dev);
+    if (mknod.isError()) {
+      return Failure(
+          "Failed to create device '" + devicePath + "': " + mknod.error());
+    }
+
+    // We have to chmod the device to make sure that the umask doesn't filter
+    // the permissions defined by the whitelist.
+    Try<Nothing> chmod = os::chmod(devicePath, dev.mode & ~S_IFMT);
+    if (chmod.isError()) {
+      return Failure(
+          "Failed to chmod device '" + devicePath + "': " + chmod.error());
+    }
+
+    ContainerMountInfo* mount = launchInfo.add_mounts();
+    mount->set_source(devicePath);
+    mount->set_target(path::join(containerConfig.rootfs(), "dev", path));
+    mount->set_flags(MS_BIND);
+  }
+
+  // TODO(jpeach) Define Task API to let schedulers specify the container
+  // devices and automatically populate the right devices cgroup entries.
+
+  return launchInfo;
 }
 
 } // namespace slave {

http://git-wip-us.apache.org/repos/asf/mesos/blob/ae413c9f/src/slave/containerizer/mesos/isolators/linux/devices.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/linux/devices.hpp b/src/slave/containerizer/mesos/isolators/linux/devices.hpp
index e731ef3..58128f0 100644
--- a/src/slave/containerizer/mesos/isolators/linux/devices.hpp
+++ b/src/slave/containerizer/mesos/isolators/linux/devices.hpp
@@ -17,6 +17,11 @@
 #ifndef __LINUX_DEVICES_ISOLATOR_HPP__
 #define __LINUX_DEVICES_ISOLATOR_HPP__
 
+#include <sys/types.h>
+
+#include <string>
+
+#include <stout/hashmap.hpp>
 #include <stout/try.hpp>
 
 #include "slave/flags.hpp"
@@ -32,15 +37,25 @@ class LinuxDevicesIsolatorProcess : public MesosIsolatorProcess
 public:
   static Try<mesos::slave::Isolator*> create(const Flags& flags);
 
-  virtual bool supportsNesting();
-  virtual bool supportsStandalone();
+  virtual bool supportsNesting() override;
+  virtual bool supportsStandalone() override;
 
   virtual process::Future<Option<mesos::slave::ContainerLaunchInfo>> prepare(
       const ContainerID& containerId,
-      const mesos::slave::ContainerConfig& containerConfig);
+      const mesos::slave::ContainerConfig& containerConfig) override;
 
 private:
-  LinuxDevicesIsolatorProcess(const Flags& _flags);
+  struct Device {
+    dev_t dev;
+    mode_t mode;
+  };
+
+  const std::string runtimeDirectory;
+  const hashmap<std::string, Device> whitelistedDevices;
+
+  LinuxDevicesIsolatorProcess(
+      const std::string& runtimeDirectory,
+      const hashmap<std::string, Device>& whitelistedDevices);
 };
 
 } // namespace slave {

Reply via email to