This is an automated email from the ASF dual-hosted git repository.
bmahler pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
The following commit(s) were added to refs/heads/master by this push:
new 1b59ed8e2 [cgroups2] Update `destroy` to be async more robust.
1b59ed8e2 is described below
commit 1b59ed8e22c074abc9328b22ac7ee49b9a481a86
Author: Devin Leamy <[email protected]>
AuthorDate: Wed Apr 24 11:14:20 2024 -0400
[cgroups2] Update `destroy` to be async more robust.
We were running into an inconsistent issue with `cgroups2::destroy`.
`cgroups2::destroy` would fail with EBUSY when removing cgroups with
`rmdir`.
The error was being caused because some processes had not been killed
when `rmdir` was called on their cgroup; a cgroup with processes
cannot be destroyed.
After signalling a kill (by writing "1" to 'cgroup.kill') sometimes
processes were staying alive long enough to cause `rmdir` to fail.
Hence, we update `cgroups2::destroy` to wait after signalling a
SIGKILL for all the processes to drop before attempting to remove
the cgroups.
Since we wait a maximum of half a second, we don't want to block the
caller. Thus, we update `destroy` to be async.
---
src/linux/cgroups2.cpp | 71 +++++++++++++++-------
src/linux/cgroups2.hpp | 2 +-
.../mesos/isolators/cgroups2/cgroups2.cpp | 29 ++++++---
.../mesos/isolators/cgroups2/cgroups2.hpp | 2 +-
src/slave/containerizer/mesos/linux_launcher.cpp | 14 +----
src/tests/containerizer/cgroups2_tests.cpp | 4 +-
6 files changed, 79 insertions(+), 43 deletions(-)
diff --git a/src/linux/cgroups2.cpp b/src/linux/cgroups2.cpp
index ad282695e..782c7c417 100644
--- a/src/linux/cgroups2.cpp
+++ b/src/linux/cgroups2.cpp
@@ -28,6 +28,7 @@
#include <process/loop.hpp>
#include <process/pid.hpp>
+#include <stout/adaptor.hpp>
#include <stout/none.hpp>
#include <stout/numify.hpp>
#include <stout/os.hpp>
@@ -381,41 +382,69 @@ Try<Nothing> kill(const std::string& cgroup)
}
-Try<Nothing> destroy(const string& cgroup)
+Future<Nothing> destroy(const string& cgroup)
{
if (!cgroups2::exists(cgroup)) {
- return Error("Cgroup '" + cgroup + "' does not exist");
+ return Failure("Cgroup '" + cgroup + "' does not exist");
}
// To destroy a subtree of cgroups we first kill all of the processes inside
// of the cgroup and then remove all of the cgroup directories, removing
// the most deeply nested directories first.
+
Try<Nothing> kill = cgroups2::kill(cgroup);
if (kill.isError()) {
- return Error("Failed to kill processes in cgroup: " + kill.error());
+ return Failure("Failed to kill processes in cgroup: " + kill.error());
}
- Try<set<string>> cgroups = cgroups2::get(cgroup);
- if (cgroups.isError()) {
- return Error("Failed to get nested cgroups: " + cgroups.error());
- }
+ // Wait until all of the processes have been killed.
+ int retries = 50;
+ Future<Nothing> emptied = loop(
+ []() { return process::after(Milliseconds(1)); },
+ [=](const Nothing&) mutable -> Future<ControlFlow<Nothing>> {
+ Try<set<pid_t>> pids = cgroups2::processes(cgroup, true);
+ if (pids.isError()) {
+ return Failure("Failed to fetch pids in cgroup: " + pids.error());
+ }
- vector<string> sorted(
- std::make_move_iterator(cgroups->begin()),
- std::make_move_iterator(cgroups->end()));
- sorted.push_back(cgroup);
- std::sort(sorted.rbegin(), sorted.rend());
+ if (pids->empty()) {
+ return Break();
+ }
- foreach (const string& cgroup, sorted) {
- const string path = cgroups2::path(cgroup);
- Try<Nothing> rmdir = os::rmdir(path, false);
- if (rmdir.isError()) {
- return Error(
- "Failed to remove directory '" + path + "': " + rmdir.error());
- }
- }
+ --retries;
+ if (retries == 0) {
+ return Failure("Processes were still found: " + stringify(*pids));
+ }
- return Nothing();
+ return Continue();
+ });
+
+ return emptied
+ .then([=]() -> Future<Nothing> {
+ Try<set<string>> cgroups = cgroups2::get(cgroup);
+ if (cgroups.isError()) {
+ return Failure("Failed to get nested cgroups: " + cgroups.error());
+ }
+
+ cgroups->insert(cgroup);
+
+ // Remove the cgroups in bottom-up order.
+ foreach (const string& cgroup, adaptor::reverse(*cgroups)) {
+ const string path = cgroups2::path(cgroup);
+
+ // Remove the cgroup's directory. If the directory does not exist,
+ // ignore the error to protect against races.
+ if (::rmdir(path.c_str()) < 0) {
+ ErrnoError error = ErrnoError();
+ if (error.code != ENOENT) {
+ return Failure(
+ "Failed to remove directory '" + path + "': " + error.message);
+ }
+ }
+ }
+
+ return Nothing();
+ });
}
diff --git a/src/linux/cgroups2.hpp b/src/linux/cgroups2.hpp
index 2639e2f74..247fafd01 100644
--- a/src/linux/cgroups2.hpp
+++ b/src/linux/cgroups2.hpp
@@ -79,7 +79,7 @@ Try<Nothing> kill(const std::string& cgroup);
// Recursively destroy a cgroup and all nested cgroups. Processes inside of
// destroyed cgroups are killed with SIGKILL.
-Try<Nothing> destroy(const std::string& cgroup);
+process::Future<Nothing> destroy(const std::string& cgroup);
// Assign a process to a cgroup, by PID, removing the process from its
diff --git a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp
b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp
index 3946b5f57..762e34380 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.cpp
@@ -751,13 +751,28 @@ Future<Nothing> Cgroups2IsolatorProcess::_cleanup(
+ strings::join(", ", errors));
}
- if (cgroups2::exists(infos[containerId]->cgroup)) {
- Try<Nothing> destroy = cgroups2::destroy(infos[containerId]->cgroup);
- if (destroy.isError()) {
- return Failure(
- "Failed to destroy cgroup '" + infos[containerId]->cgroup + "': "
- + destroy.error());
- }
+ if (!cgroups2::exists(infos[containerId]->cgroup)) {
+ infos.erase(containerId);
+ return Nothing();
+ }
+
+ return cgroups2::destroy(infos[containerId]->cgroup)
+ .then(defer(
+ PID<Cgroups2IsolatorProcess>(this),
+ &Cgroups2IsolatorProcess::__cleanup,
+ containerId,
+ lambda::_1));
+}
+
+
+Future<Nothing> Cgroups2IsolatorProcess::__cleanup(
+ const ContainerID& containerId,
+ const Future<Nothing>& future)
+{
+ if (future.isFailed()) {
+ return Failure(
+ "Failed to destroy cgroup '" + infos[containerId]->cgroup + "': "
+ + (future.isFailed() ? future.failure() : "discarded"));
}
infos.erase(containerId);
diff --git a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp
b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp
index 211b9a1df..b2d06b490 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups2/cgroups2.hpp
@@ -149,7 +149,7 @@ private:
process::Future<Nothing> __cleanup(
const ContainerID& containerId,
- const std::vector<process::Future<Nothing>>& futures);
+ const process::Future<Nothing>& future);
process::Owned<Cgroups2IsolatorProcess::Info> cgroupInfo(
const ContainerID& containerId) const;
diff --git a/src/slave/containerizer/mesos/linux_launcher.cpp
b/src/slave/containerizer/mesos/linux_launcher.cpp
index 1ca8df6cc..cbc883bd1 100644
--- a/src/slave/containerizer/mesos/linux_launcher.cpp
+++ b/src/slave/containerizer/mesos/linux_launcher.cpp
@@ -112,7 +112,7 @@ private:
Future<Nothing> destroyCgroups(const Container& container);
Future<Nothing> _destroyCgroups(const Container& container);
- Try<Nothing> destroyCgroups2(const Container& container);
+ Future<Nothing> destroyCgroups2(const Container& container);
const Flags flags;
@@ -773,7 +773,7 @@ Future<Nothing> LinuxLauncherProcess::_destroyCgroups(
}
-Try<Nothing> LinuxLauncherProcess::destroyCgroups2(
+Future<Nothing> LinuxLauncherProcess::destroyCgroups2(
const Container& container)
{
#ifdef ENABLE_CGROUPS_V2
@@ -784,15 +784,7 @@ Try<Nothing> LinuxLauncherProcess::destroyCgroups2(
LOG(INFO) << "Destroying cgroup '" << cgroup << "'";
- Try<Nothing> destroy = cgroups2::destroy(cgroup);
- if (destroy.isError()) {
- return Error("Failed to destory cgroup '" + cgroup + "': "
- + destroy.error());
- }
-
- LOG(INFO) << "Destroyed container " << container.id;
-
- return Nothing();
+ return cgroups2::destroy(cgroup);
#else
return Error("cgroups2 is not enabled");
#endif // ENABLE_CGROUPS_V2
diff --git a/src/tests/containerizer/cgroups2_tests.cpp
b/src/tests/containerizer/cgroups2_tests.cpp
index 918dfe4ef..e36f99b64 100644
--- a/src/tests/containerizer/cgroups2_tests.cpp
+++ b/src/tests/containerizer/cgroups2_tests.cpp
@@ -95,14 +95,14 @@ protected:
// Cleanup the test cgroup, in case a previous test run didn't clean it
// up properly.
if (cgroups2::exists(TEST_CGROUP)) {
- ASSERT_SOME(cgroups2::destroy(TEST_CGROUP));
+ AWAIT_READY(cgroups2::destroy(TEST_CGROUP));
}
}
void TearDown() override
{
if (cgroups2::exists(TEST_CGROUP)) {
- ASSERT_SOME(cgroups2::destroy(TEST_CGROUP));
+ AWAIT_READY(cgroups2::destroy(TEST_CGROUP));
}
ASSERT_SOME(cgroups2::controllers::disable(