bmahler commented on code in PR #519:
URL: https://github.com/apache/mesos/pull/519#discussion_r1528930282
##########
src/tests/containerizer/cgroups2_tests.cpp:
##########
@@ -59,6 +63,30 @@ TEST_F(Cgroups2Test, ROOT_CGROUPS2_AvailableSubsystems)
}
}
+
+TEST_F(Cgroups2Test, ROOT_CGROUPS2_DeviceController)
+{
+ namespace devices = cgroups2::devices;
+
+ // Deny all access.
+ devices::Entry entry = devices::Entry::parse("a *:* rwm").get();
Review Comment:
Let's ASSERT_SOME on this rather than doing a naked .get(), later code can
use * instead of .get().
##########
src/tests/containerizer/cgroups2_tests.cpp:
##########
@@ -59,6 +63,30 @@ TEST_F(Cgroups2Test, ROOT_CGROUPS2_AvailableSubsystems)
}
}
+
+TEST_F(Cgroups2Test, ROOT_CGROUPS2_DeviceController)
+{
+ namespace devices = cgroups2::devices;
+
+ // Deny all access.
+ devices::Entry entry = devices::Entry::parse("a *:* rwm").get();
+
+ EXPECT_SOME(devices::configure(cgroups2::ROOT_CGROUP, {}, {entry}));
+ Try<vector<uint32_t>> attached = ebpf::cgroups2::attached(
+ cgroups2::ROOT_CGROUP);
Review Comment:
Is this modifying the root cgroup in a test? We don't want to do that since
that's going to mess with other things on the system that run in the root
cgroup? Instead we'd want to create a test cgroup and clean it up properly
afterwards.
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
Review Comment:
do you want this private or protected?
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ // We create a block of bytecode with the format:
+ // 1. [ Type Check ] ⤵
+ // 2. [ Access Check ] ⤵
+ // 3. [ Minor Version Check ] ⤵
+ // 4. [ Major Version Check ] ⤵
+ // 5. [ Allow/Deny Access ] ↓
+ // ↓
+ // 6. [ NEXT BLOCK ] ←←
Review Comment:
can you remove the unicode characters? maybe this works, but I don't think
we've added any non-ASCII to our current files.. have we?
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ // We create a block of bytecode with the format:
+ // 1. [ Type Check ] ⤵
+ // 2. [ Access Check ] ⤵
+ // 3. [ Minor Version Check ] ⤵
+ // 4. [ Major Version Check ] ⤵
+ // 5. [ Allow/Deny Access ] ↓
+ // ↓
+ // 6. [ NEXT BLOCK ] ←←
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ // We only check attributes that are not 100% permissive:
+ // - Type != all
+ // - Access != rnw
+ // - Minor != None (AKA "any")
+ // - Major != None (AKA "any")
+ Entry::Selector selector = entry.selector;
+ Entry::Access access = entry.access;
+
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+
+ // Number of instructions to the [NEXT BLOCK]. This is used if a check
+ // fails (meaning this whitelist entry does not apply) and we want to
+ // skip the subsequent checks.
+ short jmp_size = 1 + (check_access ? 3 : 0) + (check_type ? 1 : 0) +
+ (check_major ? 1 : 0) + (check_minor ? 1 : 0);
+
+ // Check type (r2) against entry.
+ if (check_type) {
+ int bpf_type = selector.type == Entry::Selector::Type::BLOCK
+ ? BPF_DEVCG_DEV_BLOCK
+ : BPF_DEVCG_DEV_CHAR;
+
+ program.append({ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size) });
+ --jmp_size;
+ }
+ // Check access (r3) against entry.
Review Comment:
newline between each new block of code here
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ // We create a block of bytecode with the format:
+ // 1. [ Type Check ] ⤵
+ // 2. [ Access Check ] ⤵
+ // 3. [ Minor Version Check ] ⤵
+ // 4. [ Major Version Check ] ⤵
+ // 5. [ Allow/Deny Access ] ↓
+ // ↓
+ // 6. [ NEXT BLOCK ] ←←
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ // We only check attributes that are not 100% permissive:
+ // - Type != all
+ // - Access != rnw
+ // - Minor != None (AKA "any")
+ // - Major != None (AKA "any")
+ Entry::Selector selector = entry.selector;
+ Entry::Access access = entry.access;
+
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+
+ // Number of instructions to the [NEXT BLOCK]. This is used if a check
+ // fails (meaning this whitelist entry does not apply) and we want to
+ // skip the subsequent checks.
+ short jmp_size = 1 + (check_access ? 3 : 0) + (check_type ? 1 : 0) +
+ (check_major ? 1 : 0) + (check_minor ? 1 : 0);
+
+ // Check type (r2) against entry.
+ if (check_type) {
+ int bpf_type = selector.type == Entry::Selector::Type::BLOCK
+ ? BPF_DEVCG_DEV_BLOCK
+ : BPF_DEVCG_DEV_CHAR;
+
+ program.append({ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size) });
+ --jmp_size;
+ }
+ // Check access (r3) against entry.
+ if (check_access) {
+ int bpf_access = 0;
+ bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
+ bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
+ bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
+
+ program.append({
+ BPF_MOV32_REG (BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM (BPF_AND, BPF_REG_1, bpf_access),
+ BPF_JMP_REG (BPF_JNE, BPF_REG_1, BPF_REG_3, (short) (jmp_size -
2))});
+ jmp_size -= 3;
+ }
+ // Check major version (r4) against entry.
+ if (check_major) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, (int) selector.major.get(),
jmp_size)});
Review Comment:
would be good to explain these instructions in general, e.g.
```
// Jump if major (r4) != selector.major.
```
Are we jumping *past* the first instruction of the allow / deny access block
below? If so, it seems like it makes sense to separate out the append of the
BPF_EXIT_INSN instruction and make it more clear in the comments here that
we're jumping directly to that?
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ // We create a block of bytecode with the format:
+ // 1. [ Type Check ] ⤵
+ // 2. [ Access Check ] ⤵
+ // 3. [ Minor Version Check ] ⤵
+ // 4. [ Major Version Check ] ⤵
+ // 5. [ Allow/Deny Access ] ↓
+ // ↓
+ // 6. [ NEXT BLOCK ] ←←
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ // We only check attributes that are not 100% permissive:
+ // - Type != all
+ // - Access != rnw
+ // - Minor != None (AKA "any")
+ // - Major != None (AKA "any")
+ Entry::Selector selector = entry.selector;
+ Entry::Access access = entry.access;
+
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+
+ // Number of instructions to the [NEXT BLOCK]. This is used if a check
+ // fails (meaning this whitelist entry does not apply) and we want to
+ // skip the subsequent checks.
+ short jmp_size = 1 + (check_access ? 3 : 0) + (check_type ? 1 : 0) +
+ (check_major ? 1 : 0) + (check_minor ? 1 : 0);
+
+ // Check type (r2) against entry.
+ if (check_type) {
+ int bpf_type = selector.type == Entry::Selector::Type::BLOCK
+ ? BPF_DEVCG_DEV_BLOCK
+ : BPF_DEVCG_DEV_CHAR;
+
+ program.append({ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, jmp_size) });
+ --jmp_size;
+ }
+ // Check access (r3) against entry.
+ if (check_access) {
+ int bpf_access = 0;
+ bpf_access |= access.read ? BPF_DEVCG_ACC_READ : 0;
+ bpf_access |= access.write ? BPF_DEVCG_ACC_WRITE : 0;
+ bpf_access |= access.mknod ? BPF_DEVCG_ACC_MKNOD : 0;
+
+ program.append({
+ BPF_MOV32_REG (BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM (BPF_AND, BPF_REG_1, bpf_access),
+ BPF_JMP_REG (BPF_JNE, BPF_REG_1, BPF_REG_3, (short) (jmp_size -
2))});
+ jmp_size -= 3;
+ }
+ // Check major version (r4) against entry.
+ if (check_major) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, (int) selector.major.get(),
jmp_size)});
+ --jmp_size;
+ }
+ // Check minor version (r5) against entry.
+ if (check_minor) {
+ program.append({
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_5, (int) selector.minor.get(),
jmp_size)});
+ --jmp_size;
+ }
+
+ // Allow/Deny access block.
+ program.append({
+ BPF_MOV64_IMM(BPF_REG_0, allow ? ALLOW_ACCESS : DENY_ACCESS),
+ BPF_EXIT_INSN()});
+
+ return Nothing();
+ }
+
+ ebpf::Program build()
+ {
+ // Exit instructions.
+ // If no entry granted access, then deny the access.
+ program.append({
+ BPF_MOV64_IMM (BPF_REG_0, DENY_ACCESS),
+ BPF_EXIT_INSN()});
+
+ return program;
Review Comment:
are we expecting our programs to reach these instructions..? if so, when?
##########
src/tests/containerizer/cgroups2_tests.cpp:
##########
@@ -59,6 +63,30 @@ TEST_F(Cgroups2Test, ROOT_CGROUPS2_AvailableSubsystems)
}
}
+
+TEST_F(Cgroups2Test, ROOT_CGROUPS2_DeviceController)
+{
+ namespace devices = cgroups2::devices;
+
+ // Deny all access.
+ devices::Entry entry = devices::Entry::parse("a *:* rwm").get();
+
+ EXPECT_SOME(devices::configure(cgroups2::ROOT_CGROUP, {}, {entry}));
+ Try<vector<uint32_t>> attached = ebpf::cgroups2::attached(
+ cgroups2::ROOT_CGROUP);
+
+ ASSERT_SOME(attached);
+ EXPECT_EQ(1, (int) attached->size());
+
+ // Verify that we can't open /dev/null.
+ EXPECT_ERROR(os::open(os::DEV_NULL, O_RDWR));
+
+ EXPECT_SOME(ebpf::cgroups2::detach(cgroups2::ROOT_CGROUP,
attached.get()[0]));
Review Comment:
Another reason we don't want to modify the root cgroup, if our test fails or
is killed, we'll leave our device program attached?
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
Review Comment:
quite hard to understand what this is doing, can you add some explanation
that covers the byte format and the operations we're doing to extract the
values out into separate registers?
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
Review Comment:
when using an initializer list {, you can indent by two, and close the } on
its own line (if a trailing comma on the last item works as well, add one):
```
BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8),
});
```
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ // We create a block of bytecode with the format:
+ // 1. [ Type Check ] ⤵
+ // 2. [ Access Check ] ⤵
Review Comment:
Hm.. why would we check access before we know that we're looking at the
right device? i.e. that the device's type and major / minor match?
##########
src/tests/containerizer/cgroups2_tests.cpp:
##########
@@ -59,6 +63,30 @@ TEST_F(Cgroups2Test, ROOT_CGROUPS2_AvailableSubsystems)
}
}
+
+TEST_F(Cgroups2Test, ROOT_CGROUPS2_DeviceController)
+{
+ namespace devices = cgroups2::devices;
+
+ // Deny all access.
+ devices::Entry entry = devices::Entry::parse("a *:* rwm").get();
+
+ EXPECT_SOME(devices::configure(cgroups2::ROOT_CGROUP, {}, {entry}));
+ Try<vector<uint32_t>> attached = ebpf::cgroups2::attached(
+ cgroups2::ROOT_CGROUP);
+
+ ASSERT_SOME(attached);
+ EXPECT_EQ(1, (int) attached->size());
Review Comment:
did you actually need to cast the vector's size for this comparison? If so,
you can use `1u`.
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
Review Comment:
Move these inside DeviceProgram?
##########
src/tests/containerizer/cgroups2_tests.cpp:
##########
@@ -59,6 +63,30 @@ TEST_F(Cgroups2Test, ROOT_CGROUPS2_AvailableSubsystems)
}
}
+
+TEST_F(Cgroups2Test, ROOT_CGROUPS2_DeviceController)
+{
+ namespace devices = cgroups2::devices;
+
+ // Deny all access.
+ devices::Entry entry = devices::Entry::parse("a *:* rwm").get();
+
+ EXPECT_SOME(devices::configure(cgroups2::ROOT_CGROUP, {}, {entry}));
+ Try<vector<uint32_t>> attached = ebpf::cgroups2::attached(
+ cgroups2::ROOT_CGROUP);
+
+ ASSERT_SOME(attached);
+ EXPECT_EQ(1, (int) attached->size());
+
+ // Verify that we can't open /dev/null.
+ EXPECT_ERROR(os::open(os::DEV_NULL, O_RDWR));
+
+ EXPECT_SOME(ebpf::cgroups2::detach(cgroups2::ROOT_CGROUP,
attached.get()[0]));
Review Comment:
Prefer `*` or `->at(0)` over .get().
##########
src/linux/cgroups2.hpp:
##########
@@ -69,6 +71,20 @@ Try<std::set<std::string>> enabled(const std::string&
cgroup);
} // namespace controllers {
+namespace devices {
+
+using cgroups::devices::Entry;
+
+// Configure the device access permissions for the cgroup. These permissions
+// are hierarchical. I.e. if a parent cgroup does not allow an access then
+// 'this' cgroup will be denied access.
+Try<Nothing> configure(
+ const std::string& cgroup,
+ const std::vector<Entry> allow,
+ const std::vector<Entry> deny);
Review Comment:
We typically do not take a const non reference like this. The options are:
* const vector& (if we don't need to mutate it / store it)
* vector&& (if we want to force a move to avoid copying)
* vector (if we need to mutate it / store a mutated version of it, and want
to support both move and const ref / copy)
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ // We create a block of bytecode with the format:
+ // 1. [ Type Check ] ⤵
+ // 2. [ Access Check ] ⤵
+ // 3. [ Minor Version Check ] ⤵
+ // 4. [ Major Version Check ] ⤵
+ // 5. [ Allow/Deny Access ] ↓
+ // ↓
+ // 6. [ NEXT BLOCK ] ←←
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ // We only check attributes that are not 100% permissive:
+ // - Type != all
+ // - Access != rnw
+ // - Minor != None (AKA "any")
+ // - Major != None (AKA "any")
+ Entry::Selector selector = entry.selector;
+ Entry::Access access = entry.access;
+
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+
+ // Number of instructions to the [NEXT BLOCK]. This is used if a check
+ // fails (meaning this whitelist entry does not apply) and we want to
+ // skip the subsequent checks.
+ short jmp_size = 1 + (check_access ? 3 : 0) + (check_type ? 1 : 0) +
+ (check_major ? 1 : 0) + (check_minor ? 1 : 0);
Review Comment:
where are these 3's and 1's coming from? can they be computed rather than
burned in?
edit: I see now based on what gets appended below
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ // We create a block of bytecode with the format:
+ // 1. [ Type Check ] ⤵
+ // 2. [ Access Check ] ⤵
+ // 3. [ Minor Version Check ] ⤵
+ // 4. [ Major Version Check ] ⤵
+ // 5. [ Allow/Deny Access ] ↓
+ // ↓
+ // 6. [ NEXT BLOCK ] ←←
Review Comment:
What is the NEXT BLOCK?
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ // We create a block of bytecode with the format:
+ // 1. [ Type Check ] ⤵
+ // 2. [ Access Check ] ⤵
+ // 3. [ Minor Version Check ] ⤵
+ // 4. [ Major Version Check ] ⤵
+ // 5. [ Allow/Deny Access ] ↓
+ // ↓
+ // 6. [ NEXT BLOCK ] ←←
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ // We only check attributes that are not 100% permissive:
+ // - Type != all
+ // - Access != rnw
+ // - Minor != None (AKA "any")
+ // - Major != None (AKA "any")
+ Entry::Selector selector = entry.selector;
+ Entry::Access access = entry.access;
Review Comment:
Feel free to take const refs here to indicate we're not modifying these?
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ // We create a block of bytecode with the format:
+ // 1. [ Type Check ] ⤵
+ // 2. [ Access Check ] ⤵
+ // 3. [ Minor Version Check ] ⤵
+ // 4. [ Major Version Check ] ⤵
+ // 5. [ Allow/Deny Access ] ↓
+ // ↓
+ // 6. [ NEXT BLOCK ] ←←
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ // We only check attributes that are not 100% permissive:
+ // - Type != all
+ // - Access != rnw
+ // - Minor != None (AKA "any")
+ // - Major != None (AKA "any")
Review Comment:
What does this mean? This comment isn't clear to an outsider IMO.
edit: after reading the rest of the code I now get this, it seems to me like
this comment should be right on top of the check_ bools rather than one block
above it:
```
// The generated program only needs to perform checks when the selector
/ access
// are specified as follows:
bool check_type = selector.type != Entry::Selector::Type::ALL;
bool check_access = !access.mknod || !access.read || !access.write;
bool check_major = selector.major.isSome();
bool check_minor = selector.minor.isSome();
```
##########
src/linux/cgroups2.cpp:
##########
@@ -295,4 +296,152 @@ Try<set<string>> enabled(const string& cgroup)
} // namespace controllers {
+namespace devices {
+
+const int ALLOW_ACCESS = 1;
+
+const int DENY_ACCESS = 0;
+
+// Utility class to construct an eBPF program to whitelist or blacklist
+// select device accesses.
+class DeviceProgram
+{
+public:
+ DeviceProgram() : program{ebpf::Program(BPF_PROG_TYPE_CGROUP_DEVICE)}
+ {
+ program.append({
+ // r2: Type ('c', 'b', '?')
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+ // r3: Access ('r', 'w', 'm')
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+ // r4: Major Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 4),
+ // r5: Minor Version
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, 8)});
+ }
+
+ Try<Nothing> allow(const Entry entry) { return addDevice(entry, true); }
+
+ Try<Nothing> deny(const Entry entry) { return addDevice(entry, false); }
+
+ Try<Nothing> addDevice(const Entry entry, bool allow)
+ {
+ // We create a block of bytecode with the format:
+ // 1. [ Type Check ] ⤵
+ // 2. [ Access Check ] ⤵
+ // 3. [ Minor Version Check ] ⤵
+ // 4. [ Major Version Check ] ⤵
+ // 5. [ Allow/Deny Access ] ↓
+ // ↓
+ // 6. [ NEXT BLOCK ] ←←
+ //
+ // Either:
+ // 1. The device access is matched by (1,2,3,4) and the Allow/Deny access
+ // block (5) is executed.
+ // 2. One of (1,2,3,4) does not match the requested access and we skip
+ // to the next block (6).
+
+ // We only check attributes that are not 100% permissive:
+ // - Type != all
+ // - Access != rnw
+ // - Minor != None (AKA "any")
+ // - Major != None (AKA "any")
+ Entry::Selector selector = entry.selector;
+ Entry::Access access = entry.access;
+
+ bool check_type = selector.type != Entry::Selector::Type::ALL;
+ bool check_access = !access.mknod || !access.read || !access.write;
+ bool check_major = selector.major.isSome();
+ bool check_minor = selector.minor.isSome();
+
+ // Number of instructions to the [NEXT BLOCK]. This is used if a check
+ // fails (meaning this whitelist entry does not apply) and we want to
+ // skip the subsequent checks.
+ short jmp_size = 1 + (check_access ? 3 : 0) + (check_type ? 1 : 0) +
+ (check_major ? 1 : 0) + (check_minor ? 1 : 0);
+
+ // Check type (r2) against entry.
+ if (check_type) {
+ int bpf_type = selector.type == Entry::Selector::Type::BLOCK
+ ? BPF_DEVCG_DEV_BLOCK
+ : BPF_DEVCG_DEV_CHAR;
Review Comment:
can you use a switch on selector.type here instead with no default case? The
compiler will then fail if someone adds a new selector.type enum value but this
code isn't updated
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]