bmahler commented on code in PR #519:
URL: https://github.com/apache/mesos/pull/519#discussion_r1532862960
##########
src/linux/cgroups2.hpp:
##########
@@ -75,6 +77,10 @@ Try<Nothing> move_process(const std::string& cgroup, pid_t
pid);
// /sys/fs/cgroup. E.g. For /sys/fs/cgroup/test, this will return "test".
Try<std::string> cgroup(pid_t pid);
+
+// Get the absolute of a cgroup. Assumes to cgroup exists.
+std::string path(const std::string& cgroup);
Review Comment:
maybe add this in a different PR and replace all the existing path joins
with this?
##########
src/linux/cgroups2.cpp:
##########
@@ -442,4 +449,204 @@ Try<uint64_t> weight(const string& cgroup)
} // namespace cpu {
+namespace devices {
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
+ // `struct bpf_cgroup_dev_ctx*` as input. We extract the fields into
+ // registers r2-5.
+ //
+ // The device type is encoded in the first 16 bits of `access_type` and
+ // the access type is encoded in the last 16 bits of `access_type`.
+ //
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(
+ BPF_W, BPF_REG_2, BPF_REG_1, offsetof(bpf_cgroup_dev_ctx,
access_type)),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, major)),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, minor)),
+ });
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ if (hasCatchAll) {
+ return Nothing();
+ }
+ // return Nothing();
+ // We create a block of bytecode with the format:
+ // 1. Major Version Check
+ // 2. Minor Version Check
+ // 3. Type Check
+ // 4. Access Check
+ // 5. Allow/Deny Access
+ //
+ // 6. NEXT BLOCK
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ const Entry::Selector& selector = entry.selector;
+ const Entry::Access& access = entry.access;
+
+ // We only check attributes that are not 100% permissive:
+ // - Major != None (AKA "any")
+ // - Minor != None (AKA "any")
+ // - Type != all
+ // - Access != rnw
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+
+ // Number of instructions to the [NEXT BLOCK]. This is used if a check
+ // fails (meaning this whitelist entry does not apply) and we want to
+ // skip the subsequent checks.
+ short jmp_size = 1 + (check_access ? 3 : 0) + (check_type ? 1 : 0) +
+ (check_major ? 1 : 0) + (check_minor ? 1 : 0);
+
+ // Check major version (r4) against entry.
+ if (check_major) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, (int)selector.major.get(), jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check minor version (r5) against entry.
+ if (check_minor) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_5, (int)selector.minor.get(), jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check type (r2) against entry.
+ if (check_type) {
+ int bpf_type = 0;
+
+ switch (selector.type) {
+ case Entry::Selector::Type::BLOCK: {
+ bpf_type = BPF_DEVCG_DEV_BLOCK;
+ break;
+ }
+ case Entry::Selector::Type::CHARACTER: {
+ bpf_type = BPF_DEVCG_DEV_CHAR;
+ break;
+ }
+ case Entry::Selector::Type::ALL: {
+ // Won't be reached, since `check_type` is true.
+ break;
+ }
+ }
+
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check access (r3) against entry.
+ if (check_access) {
+ int bpf_access = 0;
+ bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
+ bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
+ bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
+
+ program.append({
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, (short)(jmp_size - 2)),
+ });
+ jmp_size -= 3;
+ }
+
+ if (!check_major && !check_minor && !check_type && !check_access) {
+ // Every device entry will either be allowed or denied.
Review Comment:
// The exit instructions as well as any additional device entries would
// generate unreachable blocks.
##########
src/linux/cgroups2.cpp:
##########
@@ -442,4 +449,204 @@ Try<uint64_t> weight(const string& cgroup)
} // namespace cpu {
+namespace devices {
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
+ // `struct bpf_cgroup_dev_ctx*` as input. We extract the fields into
+ // registers r2-5.
+ //
+ // The device type is encoded in the first 16 bits of `access_type` and
+ // the access type is encoded in the last 16 bits of `access_type`.
+ //
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(
+ BPF_W, BPF_REG_2, BPF_REG_1, offsetof(bpf_cgroup_dev_ctx,
access_type)),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, major)),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, minor)),
+ });
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ if (hasCatchAll) {
+ return Nothing();
+ }
+ // return Nothing();
+ // We create a block of bytecode with the format:
+ // 1. Major Version Check
+ // 2. Minor Version Check
+ // 3. Type Check
+ // 4. Access Check
+ // 5. Allow/Deny Access
+ //
+ // 6. NEXT BLOCK
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ const Entry::Selector& selector = entry.selector;
+ const Entry::Access& access = entry.access;
+
+ // We only check attributes that are not 100% permissive:
+ // - Major != None (AKA "any")
+ // - Minor != None (AKA "any")
+ // - Type != all
+ // - Access != rnw
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+
+ // Number of instructions to the [NEXT BLOCK]. This is used if a check
+ // fails (meaning this whitelist entry does not apply) and we want to
+ // skip the subsequent checks.
+ short jmp_size = 1 + (check_access ? 3 : 0) + (check_type ? 1 : 0) +
+ (check_major ? 1 : 0) + (check_minor ? 1 : 0);
+
+ // Check major version (r4) against entry.
+ if (check_major) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, (int)selector.major.get(), jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check minor version (r5) against entry.
+ if (check_minor) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_5, (int)selector.minor.get(), jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check type (r2) against entry.
+ if (check_type) {
+ int bpf_type = 0;
+
+ switch (selector.type) {
+ case Entry::Selector::Type::BLOCK: {
+ bpf_type = BPF_DEVCG_DEV_BLOCK;
+ break;
+ }
+ case Entry::Selector::Type::CHARACTER: {
+ bpf_type = BPF_DEVCG_DEV_CHAR;
+ break;
+ }
+ case Entry::Selector::Type::ALL: {
+ // Won't be reached, since `check_type` is true.
+ break;
+ }
+ }
+
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check access (r3) against entry.
+ if (check_access) {
+ int bpf_access = 0;
+ bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
+ bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
+ bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
+
+ program.append({
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, (short)(jmp_size - 2)),
+ });
+ jmp_size -= 3;
+ }
+
+ if (!check_major && !check_minor && !check_type && !check_access) {
+ // Every device entry will either be allowed or denied.
+ hasCatchAll = true;
+ }
+
+ // Allow/Deny access block.
+ program.append({
+ BPF_MOV64_IMM(BPF_REG_0, allow ? ALLOW_ACCESS : DENY_ACCESS),
+ BPF_EXIT_INSN(),
+ });
+
+ return Nothing();
+ }
+
+ ebpf::Program build()
+ {
+ if (!hasCatchAll) {
+ // Exit instructions.
+ // If no entry granted access, then deny the access.
+ program.append({
+ BPF_MOV64_IMM (BPF_REG_0, DENY_ACCESS),
+ BPF_EXIT_INSN(),
+ });
+ }
+ return program;
+ }
+
+ ebpf::Program program;
+
+ // True if the program has a device entry that allows or denies ALL accesses.
+ // Such cases need to be specially handled because any instructions added
+ // after it will be unreachable, and thus will cause the eBPF verifier to
+ // reject the program.
+ bool hasCatchAll = false;
+
+ static const int ALLOW_ACCESS = 1;
+
+ static const int DENY_ACCESS = 0;
+};
+
+
+Try<Nothing> configure(
+ const string& cgroup,
+ const vector<Entry>& allow,
+ const vector<Entry>& deny)
+{
+ DeviceProgram program = DeviceProgram();
+ foreach (const Entry entry, allow) {
+ program.allow(entry);
+ }
+ foreach (const Entry entry, deny) {
+ program.deny(entry);
+ }
+
+ Try<Nothing> attach =
+ ebpf::cgroups2::attach(cgroups2::path(cgroup), program.build());
+ if (attach.isError()) {
+ return Error(
+ "Failed to attach Device Controller program: " + attach.error());
Review Comment:
maybe "cgroup device program" or BPF_PROG_TYPE_CGROUP_DEVICE?
##########
src/linux/cgroups2.cpp:
##########
@@ -442,4 +449,204 @@ Try<uint64_t> weight(const string& cgroup)
} // namespace cpu {
+namespace devices {
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
+ // `struct bpf_cgroup_dev_ctx*` as input. We extract the fields into
+ // registers r2-5.
+ //
+ // The device type is encoded in the first 16 bits of `access_type` and
+ // the access type is encoded in the last 16 bits of `access_type`.
+ //
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(
+ BPF_W, BPF_REG_2, BPF_REG_1, offsetof(bpf_cgroup_dev_ctx,
access_type)),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, major)),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, minor)),
+ });
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ if (hasCatchAll) {
+ return Nothing();
+ }
+ // return Nothing();
+ // We create a block of bytecode with the format:
+ // 1. Major Version Check
+ // 2. Minor Version Check
+ // 3. Type Check
+ // 4. Access Check
+ // 5. Allow/Deny Access
+ //
+ // 6. NEXT BLOCK
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ const Entry::Selector& selector = entry.selector;
+ const Entry::Access& access = entry.access;
+
+ // We only check attributes that are not 100% permissive:
+ // - Major != None (AKA "any")
+ // - Minor != None (AKA "any")
+ // - Type != all
+ // - Access != rnw
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+
+ // Number of instructions to the [NEXT BLOCK]. This is used if a check
+ // fails (meaning this whitelist entry does not apply) and we want to
+ // skip the subsequent checks.
+ short jmp_size = 1 + (check_access ? 3 : 0) + (check_type ? 1 : 0) +
+ (check_major ? 1 : 0) + (check_minor ? 1 : 0);
+
+ // Check major version (r4) against entry.
+ if (check_major) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, (int)selector.major.get(), jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check minor version (r5) against entry.
+ if (check_minor) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_5, (int)selector.minor.get(), jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check type (r2) against entry.
+ if (check_type) {
+ int bpf_type = 0;
+
+ switch (selector.type) {
+ case Entry::Selector::Type::BLOCK: {
+ bpf_type = BPF_DEVCG_DEV_BLOCK;
+ break;
+ }
+ case Entry::Selector::Type::CHARACTER: {
+ bpf_type = BPF_DEVCG_DEV_CHAR;
+ break;
+ }
+ case Entry::Selector::Type::ALL: {
+ // Won't be reached, since `check_type` is true.
+ break;
+ }
+ }
Review Comment:
```suggestion
int bpf_type = [selector]() {
switch (selector.type) {
case Entry::Selector::Type::BLOCK: return BPF_DEVCG_DEV_BLOCK;
case Entry::Selector::Type::CHARACTER: return BPF_DEVCG_DEV_CHAR;
case Entry::Selector::Type::ALL: UNREACHABLE();
}
}();
```
##########
src/tests/containerizer/cgroups2_tests.cpp:
##########
@@ -115,6 +119,48 @@ TEST_F(Cgroups2Test, ROOT_CGROUPS2_AssignProcessToCgroup)
AWAIT_EXPECT_WTERMSIG_EQ(SIGKILL, process::reap(pid));
}
+
+TEST_F(Cgroups2Test, ROOT_CGROUPS2_DeviceController)
+{
+ namespace devices = cgroups2::devices;
+
+ ASSERT_SOME(cgroups2::create(TEST_CGROUP));
+ string absolutePath = cgroups2::path(TEST_CGROUP);
+
+ // Deny all access.
+ Try<devices::Entry> entry = devices::Entry::parse("c *:* rwm");
+ ASSERT_SOME(entry);
Review Comment:
would be good to get more coverage of the logic here, rather than only
testing a single deny all character devices?
##########
src/linux/cgroups2.cpp:
##########
@@ -442,4 +449,204 @@ Try<uint64_t> weight(const string& cgroup)
} // namespace cpu {
+namespace devices {
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
+ // `struct bpf_cgroup_dev_ctx*` as input. We extract the fields into
+ // registers r2-5.
+ //
+ // The device type is encoded in the first 16 bits of `access_type` and
+ // the access type is encoded in the last 16 bits of `access_type`.
+ //
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(
+ BPF_W, BPF_REG_2, BPF_REG_1, offsetof(bpf_cgroup_dev_ctx,
access_type)),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, major)),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, minor)),
+ });
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ if (hasCatchAll) {
+ return Nothing();
+ }
+ // return Nothing();
+ // We create a block of bytecode with the format:
+ // 1. Major Version Check
+ // 2. Minor Version Check
+ // 3. Type Check
+ // 4. Access Check
+ // 5. Allow/Deny Access
+ //
+ // 6. NEXT BLOCK
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ const Entry::Selector& selector = entry.selector;
+ const Entry::Access& access = entry.access;
+
+ // We only check attributes that are not 100% permissive:
+ // - Major != None (AKA "any")
+ // - Minor != None (AKA "any")
+ // - Type != all
+ // - Access != rnw
Review Comment:
seems clear in the code already?
##########
src/linux/cgroups2.cpp:
##########
@@ -442,4 +449,204 @@ Try<uint64_t> weight(const string& cgroup)
} // namespace cpu {
+namespace devices {
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
+ // `struct bpf_cgroup_dev_ctx*` as input. We extract the fields into
+ // registers r2-5.
+ //
+ // The device type is encoded in the first 16 bits of `access_type` and
+ // the access type is encoded in the last 16 bits of `access_type`.
+ //
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(
+ BPF_W, BPF_REG_2, BPF_REG_1, offsetof(bpf_cgroup_dev_ctx,
access_type)),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, major)),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, minor)),
+ });
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ if (hasCatchAll) {
+ return Nothing();
+ }
+ // return Nothing();
+ // We create a block of bytecode with the format:
+ // 1. Major Version Check
+ // 2. Minor Version Check
+ // 3. Type Check
+ // 4. Access Check
+ // 5. Allow/Deny Access
+ //
+ // 6. NEXT BLOCK
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ const Entry::Selector& selector = entry.selector;
+ const Entry::Access& access = entry.access;
+
+ // We only check attributes that are not 100% permissive:
+ // - Major != None (AKA "any")
+ // - Minor != None (AKA "any")
+ // - Type != all
+ // - Access != rnw
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+
+ // Number of instructions to the [NEXT BLOCK]. This is used if a check
+ // fails (meaning this whitelist entry does not apply) and we want to
+ // skip the subsequent checks.
+ short jmp_size = 1 + (check_access ? 3 : 0) + (check_type ? 1 : 0) +
+ (check_major ? 1 : 0) + (check_minor ? 1 : 0);
+
+ // Check major version (r4) against entry.
+ if (check_major) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, (int)selector.major.get(), jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check minor version (r5) against entry.
+ if (check_minor) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_5, (int)selector.minor.get(), jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check type (r2) against entry.
+ if (check_type) {
+ int bpf_type = 0;
+
+ switch (selector.type) {
+ case Entry::Selector::Type::BLOCK: {
+ bpf_type = BPF_DEVCG_DEV_BLOCK;
+ break;
+ }
+ case Entry::Selector::Type::CHARACTER: {
+ bpf_type = BPF_DEVCG_DEV_CHAR;
+ break;
+ }
+ case Entry::Selector::Type::ALL: {
+ // Won't be reached, since `check_type` is true.
+ break;
+ }
+ }
+
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check access (r3) against entry.
+ if (check_access) {
+ int bpf_access = 0;
+ bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
+ bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
+ bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
+
+ program.append({
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, (short)(jmp_size - 2)),
Review Comment:
static_cast
##########
src/linux/cgroups2.cpp:
##########
@@ -442,4 +449,204 @@ Try<uint64_t> weight(const string& cgroup)
} // namespace cpu {
+namespace devices {
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
+ // `struct bpf_cgroup_dev_ctx*` as input. We extract the fields into
+ // registers r2-5.
+ //
+ // The device type is encoded in the first 16 bits of `access_type` and
+ // the access type is encoded in the last 16 bits of `access_type`.
+ //
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(
+ BPF_W, BPF_REG_2, BPF_REG_1, offsetof(bpf_cgroup_dev_ctx,
access_type)),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, major)),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, minor)),
+ });
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ if (hasCatchAll) {
+ return Nothing();
+ }
+ // return Nothing();
+ // We create a block of bytecode with the format:
+ // 1. Major Version Check
+ // 2. Minor Version Check
+ // 3. Type Check
+ // 4. Access Check
+ // 5. Allow/Deny Access
+ //
+ // 6. NEXT BLOCK
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ const Entry::Selector& selector = entry.selector;
+ const Entry::Access& access = entry.access;
+
+ // We only check attributes that are not 100% permissive:
+ // - Major != None (AKA "any")
+ // - Minor != None (AKA "any")
+ // - Type != all
+ // - Access != rnw
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+
+ // Number of instructions to the [NEXT BLOCK]. This is used if a check
+ // fails (meaning this whitelist entry does not apply) and we want to
+ // skip the subsequent checks.
+ short jmp_size = 1 + (check_access ? 3 : 0) + (check_type ? 1 : 0) +
+ (check_major ? 1 : 0) + (check_minor ? 1 : 0);
+
+ // Check major version (r4) against entry.
+ if (check_major) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, (int)selector.major.get(), jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check minor version (r5) against entry.
+ if (check_minor) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_5, (int)selector.minor.get(), jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check type (r2) against entry.
+ if (check_type) {
+ int bpf_type = 0;
+
+ switch (selector.type) {
+ case Entry::Selector::Type::BLOCK: {
+ bpf_type = BPF_DEVCG_DEV_BLOCK;
+ break;
+ }
+ case Entry::Selector::Type::CHARACTER: {
+ bpf_type = BPF_DEVCG_DEV_CHAR;
+ break;
+ }
+ case Entry::Selector::Type::ALL: {
+ // Won't be reached, since `check_type` is true.
+ break;
+ }
+ }
+
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size),
+ });
+ --jmp_size;
+ }
+
+ // Check access (r3) against entry.
+ if (check_access) {
+ int bpf_access = 0;
+ bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
+ bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
+ bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
+
+ program.append({
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, bpf_access),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, (short)(jmp_size - 2)),
+ });
+ jmp_size -= 3;
+ }
+
+ if (!check_major && !check_minor && !check_type && !check_access) {
+ // Every device entry will either be allowed or denied.
+ hasCatchAll = true;
+ }
+
+ // Allow/Deny access block.
+ program.append({
+ BPF_MOV64_IMM(BPF_REG_0, allow ? ALLOW_ACCESS : DENY_ACCESS),
+ BPF_EXIT_INSN(),
+ });
+
+ return Nothing();
+ }
+
+ ebpf::Program build()
+ {
+ if (!hasCatchAll) {
+ // Exit instructions.
+ // If no entry granted access, then deny the access.
+ program.append({
+ BPF_MOV64_IMM (BPF_REG_0, DENY_ACCESS),
+ BPF_EXIT_INSN(),
+ });
+ }
+ return program;
+ }
+
+ ebpf::Program program;
+
+ // True if the program has a device entry that allows or denies ALL accesses.
+ // Such cases need to be specially handled because any instructions added
+ // after it will be unreachable, and thus will cause the eBPF verifier to
+ // reject the program.
+ bool hasCatchAll = false;
+
+ static const int ALLOW_ACCESS = 1;
+
+ static const int DENY_ACCESS = 0;
+};
+
+
+Try<Nothing> configure(
+ const string& cgroup,
+ const vector<Entry>& allow,
+ const vector<Entry>& deny)
Review Comment:
4 spaces here?
##########
src/linux/cgroups2.cpp:
##########
@@ -442,4 +449,204 @@ Try<uint64_t> weight(const string& cgroup)
} // namespace cpu {
+namespace devices {
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // The BPF_PROG_TYPE_CGROUP_DEVICE program takes in
+ // `struct bpf_cgroup_dev_ctx*` as input. We extract the fields into
+ // registers r2-5.
+ //
+ // The device type is encoded in the first 16 bits of `access_type` and
+ // the access type is encoded in the last 16 bits of `access_type`.
+ //
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(
+ BPF_W, BPF_REG_2, BPF_REG_1, offsetof(bpf_cgroup_dev_ctx,
access_type)),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, major)),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(bpf_cgroup_dev_ctx, minor)),
+ });
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ if (hasCatchAll) {
+ return Nothing();
+ }
+ // return Nothing();
Review Comment:
replace this with newline?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]