This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 9d5468d1987 [branch-2.1](memory) BE memory info compatible with
CgroupV2 (#39799)
9d5468d1987 is described below
commit 9d5468d1987051fc8c078738986ab2eccfe4dbb8
Author: Xinyi Zou <[email protected]>
AuthorDate: Fri Aug 23 02:03:00 2024 +0800
[branch-2.1](memory) BE memory info compatible with CgroupV2 (#39799)
pick #39256
---
be/src/common/cgroup_memory_ctl.cpp | 196 +++++++++++++++++++++++
be/src/common/cgroup_memory_ctl.h | 45 ++++++
be/src/common/status.h | 4 +-
be/src/util/cgroup_util.cpp | 228 ++++++++++++++-------------
be/src/util/cgroup_util.h | 82 +++++++---
be/src/util/mem_info.cpp | 61 ++-----
be/test/util/cgroup_util_test.cpp | 49 +++++-
be/test/util/test_data/memory.limit_in_bytes | 1 +
be/test/util/test_data/memory.stat | 36 +++++
9 files changed, 512 insertions(+), 190 deletions(-)
diff --git a/be/src/common/cgroup_memory_ctl.cpp
b/be/src/common/cgroup_memory_ctl.cpp
new file mode 100644
index 00000000000..a29432bdb4e
--- /dev/null
+++ b/be/src/common/cgroup_memory_ctl.cpp
@@ -0,0 +1,196 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/CgroupsMemoryUsageObserver.cpp
+// and modified by Doris
+
+#include "common/cgroup_memory_ctl.h"
+
+#include <cstdint>
+#include <filesystem>
+#include <fstream>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "common/status.h"
+#include "util/cgroup_util.h"
+
+namespace doris {
+
+// Is the memory controller of cgroups v2 enabled for the cgroup of this process?
+//
+// `*ret` is always written: it is true only when the "cgroup.controllers" file of the
+// process's cgroup could be opened and its first line contains "memory". Every error
+// path stores false, so callers may safely read the flag even when the returned
+// status is not OK.
+//
+// Returns Status::OK() once the controllers file has been read, and a CgroupError
+// when cgroups v2 is not enabled, the file cannot be opened, or the build is not
+// for Linux.
+Status cgroupsv2_memory_controller_enabled(bool* ret) {
+    // Pessimistic default so that no return path leaves the out-parameter unset.
+    *ret = false;
+#if defined(OS_LINUX)
+    if (!CGroupUtil::cgroupsv2_enable()) {
+        return Status::CgroupError("cgroupsv2_enable is false");
+    }
+    // According to https://docs.kernel.org/admin-guide/cgroup-v2.html, file
+    // "cgroup.controllers" defines which controllers are available for the current +
+    // child cgroups. The set of available controllers can be restricted from level to
+    // level using file "cgroup.subtree_control". It is therefore sufficient to check
+    // the bottom-most nested "cgroup.controllers" file.
+    const std::string cgroup = CGroupUtil::cgroupv2_of_process();
+    const auto cgroup_dir =
+            cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup);
+    std::ifstream controllers_file(cgroup_dir / "cgroup.controllers");
+    if (!controllers_file.is_open()) {
+        return Status::CgroupError("open cgroup.controllers failed");
+    }
+    std::string controllers;
+    std::getline(controllers_file, controllers);
+    *ret = controllers.find("memory") != std::string::npos;
+    return Status::OK();
+#else
+    return Status::CgroupError("cgroupsv2 only support Linux");
+#endif
+}
+
+// Reads the memory limit and usage from a cgroups v1 "memory" controller directory.
+struct CgroupsV1Reader : CGroupMemoryCtl::ICgroupsReader {
+    // `mount_file_dir` is the directory holding "memory.limit_in_bytes" and "memory.stat".
+    explicit CgroupsV1Reader(std::filesystem::path mount_file_dir)
+            : _mount_file_dir(std::move(mount_file_dir)) {}
+
+    // Stores the single integer held by "memory.limit_in_bytes" in `value`.
+    // Returns the helper's error status when the file cannot be read or parsed.
+    Status read_memory_limit(int64_t* value) override {
+        return CGroupUtil::read_int_line_from_cgroup_file(
+                _mount_file_dir / "memory.limit_in_bytes", value);
+    }
+
+    // Stores the "rss" entry of "memory.stat" in `value`.
+    // Returns a CgroupError when that entry is absent. The metric reader reports no
+    // errors of its own, so a missing key is the only sign that the file could not be
+    // read; reporting it avoids presenting "0 bytes used" as a valid measurement.
+    Status read_memory_usage(int64_t* value) override {
+        std::unordered_map<std::string, int64_t> metrics_map;
+        CGroupUtil::read_int_metric_from_cgroup_file(_mount_file_dir / "memory.stat",
+                                                     metrics_map);
+        const auto rss = metrics_map.find("rss");
+        if (rss == metrics_map.end()) {
+            return Status::CgroupError(
+                    "CgroupsV1Reader read_memory_usage cannot find rss in memory.stat");
+        }
+        *value = rss->second;
+        return Status::OK();
+    }
+
+private:
+    std::filesystem::path _mount_file_dir;
+};
+
+// Reads the memory limit and usage from a cgroups v2 directory.
+struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader {
+    // `mount_file_dir` is the directory holding "memory.max", "memory.current" and
+    // "memory.stat".
+    explicit CgroupsV2Reader(std::filesystem::path mount_file_dir)
+            : _mount_file_dir(std::move(mount_file_dir)) {}
+
+    // Stores the limit found in "memory.max" in `value`.
+    // The kernel writes the literal "max" when no limit is configured; that is
+    // reported as INT64_MAX (effectively unlimited) instead of as a parse error.
+    Status read_memory_limit(int64_t* value) override {
+        const std::filesystem::path limit_file = _mount_file_dir / "memory.max";
+        std::ifstream limit_stream(limit_file, std::ios::in);
+        std::string first_line;
+        if (std::getline(limit_stream, first_line) && first_line == "max") {
+            *value = std::numeric_limits<int64_t>::max();
+            return Status::OK();
+        }
+        // Numeric limit or unreadable file: the shared helper either parses the
+        // number or produces the error status.
+        return CGroupUtil::read_int_line_from_cgroup_file(limit_file, value);
+    }
+
+    // Stores "memory.current" minus the "inactive_file" entry of "memory.stat" in
+    // `value`. A missing "inactive_file" entry counts as 0. Returns a CgroupError if
+    // "memory.current" cannot be read or the difference would be negative.
+    Status read_memory_usage(int64_t* value) override {
+        // memory.current contains a single number.
+        // The reason why inactive_file is subtracted from it is described here:
+        // https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667
+        RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file(
+                _mount_file_dir / "memory.current", value));
+        std::unordered_map<std::string, int64_t> metrics_map;
+        CGroupUtil::read_int_metric_from_cgroup_file(_mount_file_dir / "memory.stat",
+                                                     metrics_map);
+        const int64_t inactive_file = metrics_map["inactive_file"];
+        if (*value < inactive_file) {
+            return Status::CgroupError("CgroupsV2Reader read_memory_usage negative memory usage");
+        }
+        *value -= inactive_file;
+        return Status::OK();
+    }
+
+private:
+    std::filesystem::path _mount_file_dir;
+};
+
+// Picks the directory to read memory files from, together with its cgroups version.
+//
+// cgroups v2 is chosen when v2 is enabled, its memory controller is available, and
+// "memory.stat", "memory.current" and "memory.max" all resolve to the same directory.
+// Otherwise the cgroups v1 "memory" path is used. When neither can be found the
+// returned path is empty (the version is then V1 and carries no meaning).
+std::pair<std::string, CGroupUtil::CgroupsVersion> get_cgroups_path() {
+    // Only read below after the status turned out OK, in which case it has been set.
+    bool enable_controller;
+    const auto controller_st = cgroupsv2_memory_controller_enabled(&enable_controller);
+    const bool v2_usable =
+            CGroupUtil::cgroupsv2_enable() && controller_st.ok() && enable_controller;
+    if (v2_usable) {
+        const auto stat_dir = CGroupUtil::get_cgroupsv2_path("memory.stat");
+        const auto current_dir = CGroupUtil::get_cgroupsv2_path("memory.current");
+        const auto max_dir = CGroupUtil::get_cgroupsv2_path("memory.max");
+        const bool all_found =
+                stat_dir.has_value() && current_dir.has_value() && max_dir.has_value();
+        if (all_found && stat_dir == current_dir && current_dir == max_dir) {
+            return {*stat_dir, CGroupUtil::CgroupsVersion::V2};
+        }
+    }
+
+    std::string v1_dir;
+    if (CGroupUtil::find_abs_cgroupv1_path("memory", &v1_dir).ok()) {
+        return {v1_dir, CGroupUtil::CgroupsVersion::V1};
+    }
+
+    return {"", CGroupUtil::CgroupsVersion::V1};
+}
+
+// Creates the reader matching the cgroups version detected by get_cgroups_path() and
+// stores it in `reader`.
+//
+// Returns a CgroupError, leaving `reader` untouched, when no cgroups memory directory
+// was found. The message reports the v2/v1 detection results to help diagnose why.
+Status get_cgroups_reader(std::shared_ptr<CGroupMemoryCtl::ICgroupsReader>& reader) {
+    const auto [cgroup_path, version] = get_cgroups_path();
+    if (cgroup_path.empty()) {
+        // Initialized because the probe may fail before it writes the flag, and the
+        // flag is formatted below regardless of the probe's status.
+        bool enable_controller = false;
+        auto st = cgroupsv2_memory_controller_enabled(&enable_controller);
+        // Each label is followed by the value it describes; the controller flag is
+        // paired with the status explaining it.
+        return Status::CgroupError(
+                "Cannot find cgroups v1 or v2 current memory file, cgroupsv2_enable: {}, "
+                "cgroupsv2_memory_controller_enabled: {},{}, cgroupsv1_enable: {}",
+                CGroupUtil::cgroupsv2_enable(), enable_controller, st.to_string(),
+                CGroupUtil::cgroupsv1_enable());
+    }
+
+    if (version == CGroupUtil::CgroupsVersion::V2) {
+        reader = std::make_shared<CgroupsV2Reader>(cgroup_path);
+    } else {
+        reader = std::make_shared<CgroupsV1Reader>(cgroup_path);
+    }
+    return Status::OK();
+}
+
+// Writes the memory limit of the current process's cgroup to `bytes`.
+// Fails with the underlying status when no reader can be created or the limit
+// cannot be read.
+Status CGroupMemoryCtl::find_cgroup_mem_limit(int64_t* bytes) {
+    std::shared_ptr<CGroupMemoryCtl::ICgroupsReader> cgroups_reader;
+    if (Status reader_st = get_cgroups_reader(cgroups_reader); !reader_st.ok()) {
+        return reader_st;
+    }
+    return cgroups_reader->read_memory_limit(bytes);
+}
+
+// Writes the memory usage of the current process's cgroup to `bytes`.
+// Fails with the underlying status when no reader can be created or the usage
+// cannot be read.
+Status CGroupMemoryCtl::find_cgroup_mem_usage(int64_t* bytes) {
+    std::shared_ptr<CGroupMemoryCtl::ICgroupsReader> cgroups_reader;
+    if (Status reader_st = get_cgroups_reader(cgroups_reader); !reader_st.ok()) {
+        return reader_st;
+    }
+    return cgroups_reader->read_memory_usage(bytes);
+}
+
+// Builds a human-readable summary of the cgroup memory settings of this process.
+//
+// When no cgroups memory directory is found, the string explains which detection
+// steps failed. Otherwise it lists the directory, the cgroups version, and the memory
+// limit and usage; a value that could not be read is replaced by its error status.
+std::string CGroupMemoryCtl::debug_string() {
+    const auto [cgroup_path, version] = get_cgroups_path();
+    if (cgroup_path.empty()) {
+        // Initialized because the probe may fail before it writes the flag, and the
+        // flag is formatted below regardless of the probe's status.
+        bool enable_controller = false;
+        auto st = cgroupsv2_memory_controller_enabled(&enable_controller);
+        // Each label is followed by the value it describes; the controller flag is
+        // paired with the status explaining it.
+        return fmt::format(
+                "Cannot find cgroups v1 or v2 current memory file, cgroupsv2_enable: {}, "
+                "cgroupsv2_memory_controller_enabled: {},{}, cgroupsv1_enable: {}",
+                CGroupUtil::cgroupsv2_enable(), enable_controller, st.to_string(),
+                CGroupUtil::cgroupsv1_enable());
+    }
+
+    // Zero-initialized so the variables hold a defined value even if a read fails;
+    // they are only printed when the matching status is OK.
+    int64_t mem_limit = 0;
+    auto mem_limit_st = find_cgroup_mem_limit(&mem_limit);
+
+    int64_t mem_usage = 0;
+    auto mem_usage_st = find_cgroup_mem_usage(&mem_usage);
+
+    return fmt::format(
+            "Process CGroup Memory Info (cgroups path: {}, cgroup version: {}): memory limit: "
+            "{}, "
+            "memory usage: {}",
+            cgroup_path, (version == CGroupUtil::CgroupsVersion::V1) ? "v1" : "v2",
+            mem_limit_st.ok() ? std::to_string(mem_limit) : mem_limit_st.to_string(),
+            mem_usage_st.ok() ? std::to_string(mem_usage) : mem_usage_st.to_string());
+}
+
+} // namespace doris
diff --git a/be/src/common/cgroup_memory_ctl.h
b/be/src/common/cgroup_memory_ctl.h
new file mode 100644
index 00000000000..83f33e03cda
--- /dev/null
+++ b/be/src/common/cgroup_memory_ctl.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "common/status.h"
+
+namespace doris {
+
+// Static entry points for querying the memory limit and usage of the cgroup that the
+// current process runs in.
+class CGroupMemoryCtl {
+public:
+    // Interface for reading memory figures from a cgroup directory.
+    // Inherited by the cgroup v1 and v2 readers.
+    struct ICgroupsReader {
+        virtual ~ICgroupsReader() = default;
+
+        // Stores the cgroup memory limit, in bytes, in `value`.
+        virtual Status read_memory_limit(int64_t* value) = 0;
+
+        // Stores the cgroup memory usage, in bytes, in `value`.
+        virtual Status read_memory_usage(int64_t* value) = 0;
+    };
+
+    // Determines the CGroup memory limit from the current processes' cgroup.
+    // If the limit is more than INT64_MAX, INT64_MAX is returned (since that is
+    // effectively unlimited anyway). Does not take into account memory limits
+    // set on any ancestor CGroups.
+    static Status find_cgroup_mem_limit(int64_t* bytes);
+
+    // Determines the memory usage of the current processes' cgroup.
+    // Background on how cgroup usage differs from what `free` reports:
+    // https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command
+    static Status find_cgroup_mem_usage(int64_t* bytes);
+
+    // Returns a human-readable string with information about CGroups.
+    static std::string debug_string();
+};
+} // namespace doris
diff --git a/be/src/common/status.h b/be/src/common/status.h
index 94988cbdae3..8847bb7c087 100644
--- a/be/src/common/status.h
+++ b/be/src/common/status.h
@@ -281,7 +281,8 @@ namespace ErrorCode {
E(INVERTED_INDEX_ANALYZER_ERROR, -6011, false); \
E(KEY_NOT_FOUND, -7000, false); \
E(KEY_ALREADY_EXISTS, -7001, false); \
- E(ENTRY_NOT_FOUND, -7002, false);
+ E(ENTRY_NOT_FOUND, -7002, false); \
+ E(CGROUP_ERROR, -7411, false);
// Define constexpr int error_code_name = error_code_value
#define M(NAME, ERRORCODE, ENABLESTACKTRACE) constexpr int NAME = ERRORCODE;
@@ -469,6 +470,7 @@ public:
ERROR_CTOR(NotAuthorized, NOT_AUTHORIZED)
ERROR_CTOR(HttpError, HTTP_ERROR)
ERROR_CTOR(NeedSendAgain, NEED_SEND_AGAIN)
+ ERROR_CTOR(CgroupError, CGROUP_ERROR)
#undef ERROR_CTOR
template <int code>
diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp
index 9ad78696a6f..8109e38559f 100644
--- a/be/src/util/cgroup_util.cpp
+++ b/be/src/util/cgroup_util.cpp
@@ -18,10 +18,7 @@
#include "util/cgroup_util.h"
#include <algorithm>
-#include <cfloat>
#include <fstream>
-#include <iomanip>
-#include <memory>
#include <utility>
#include <vector>
@@ -40,14 +37,33 @@ using std::pair;
namespace doris {
-Status CGroupUtil::find_global_cgroup(const string& subsystem, string* path) {
+// Reports whether cgroups v1 looks available: true only when the existence check on
+// "/proc/cgroups" succeeds and finds the file.
+bool CGroupUtil::cgroupsv1_enable() {
+    bool found = true;
+    const Status check_st = io::global_local_filesystem()->exists("/proc/cgroups", &found);
+    if (!check_st.ok()) {
+        return false;
+    }
+    return found;
+}
+
+// Reports whether cgroups v2 looks available: true only when the existence check on
+// "cgroup.controllers" under the default cgroups mount succeeds and finds the file.
+// Always false on builds that are not for Linux.
+bool CGroupUtil::cgroupsv2_enable() {
+#if defined(OS_LINUX)
+    // This file exists iff the host has cgroups v2 enabled.
+    const auto v2_marker_file = default_cgroups_mount / "cgroup.controllers";
+    bool found = true;
+    const Status check_st = io::global_local_filesystem()->exists(v2_marker_file, &found);
+    if (!check_st.ok()) {
+        return false;
+    }
+    return found;
+#else
+    return false;
+#endif
+}
+
+Status CGroupUtil::find_global_cgroupv1(const string& subsystem, string* path)
{
std::ifstream proc_cgroups("/proc/self/cgroup", std::ios::in);
string line;
while (true) {
if (proc_cgroups.fail()) {
- return Status::IOError("Error reading /proc/self/cgroup: {}",
get_str_err_msg());
+ return Status::CgroupError("Error reading /proc/self/cgroup: {}",
get_str_err_msg());
} else if (proc_cgroups.peek() == std::ifstream::traits_type::eof()) {
- return Status::NotFound("Could not find subsystem {} in
/proc/self/cgroup", subsystem);
+ return Status::CgroupError("Could not find subsystem {} in
/proc/self/cgroup",
+ subsystem);
}
// The line format looks like this:
// 4:memory:/user.slice
@@ -82,32 +98,15 @@ static Status unescape_path(const string& escaped, string*
unescaped) {
return Status::OK();
}
-static Status read_cgroup_value(const string& limit_file_path, int64_t* val) {
- std::ifstream limit_file(limit_file_path, std::ios::in);
- string line;
- getline(limit_file, line);
- if (limit_file.fail() || limit_file.bad()) {
- return Status::IOError("Error reading {}: {}", limit_file_path,
get_str_err_msg());
- }
- StringParser::ParseResult pr;
- // Parse into an int64_t If it overflows, returning the max value of
int64_t is ok because that
- // is effectively unlimited.
- *val = StringParser::string_to_int<int64_t>(line.c_str(), line.size(),
&pr);
- if ((pr != StringParser::PARSE_SUCCESS && pr !=
StringParser::PARSE_OVERFLOW)) {
- return Status::InvalidArgument("Failed to parse {} as int64: '{}'",
limit_file_path, line);
- }
- return Status::OK();
-}
-
-Status CGroupUtil::find_cgroup_mounts(const string& subsystem, pair<string,
string>* result) {
+Status CGroupUtil::find_cgroupv1_mounts(const string& subsystem, pair<string,
string>* result) {
std::ifstream mountinfo("/proc/self/mountinfo", std::ios::in);
string line;
while (true) {
if (mountinfo.fail() || mountinfo.bad()) {
- return Status::IOError("Error reading /proc/self/mountinfo: {}",
get_str_err_msg());
+ return Status::CgroupError("Error reading /proc/self/mountinfo:
{}", get_str_err_msg());
} else if (mountinfo.eof()) {
- return Status::NotFound("Could not find subsystem {} in
/proc/self/mountinfo",
- subsystem);
+ return Status::CgroupError("Could not find subsystem {} in
/proc/self/mountinfo",
+ subsystem);
}
// The relevant lines look like below (see proc manpage for full
documentation). The
// first example is running outside of a container, the second example
is running
@@ -118,14 +117,18 @@ Status CGroupUtil::find_cgroup_mounts(const string&
subsystem, pair<string, stri
// 275 271 0:28 /docker/f23eee6f88c2ba99fcce /sys/fs/cgroup/memory
// ro,nosuid,nodev,noexec,relatime master:15 - cgroup cgroup
rw,memory
getline(mountinfo, line);
- if (!mountinfo.good()) continue;
+ if (!mountinfo.good()) {
+ continue;
+ }
std::vector<string> fields = Split(line, " ", SkipWhitespace());
if (fields.size() < 7) {
return Status::InvalidArgument(
"Could not parse line from /proc/self/mountinfo - had {} >
7 tokens: '{}'",
fields.size(), line);
}
- if (fields[fields.size() - 3] != "cgroup") continue;
+ if (fields[fields.size() - 3] != "cgroup") {
+ continue;
+ }
// This is a cgroup mount. Check if it's the mount we're looking for.
std::vector<string> cgroup_opts = Split(fields[fields.size() - 1],
",", SkipWhitespace());
auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem);
@@ -138,16 +141,21 @@ Status CGroupUtil::find_cgroup_mounts(const string&
subsystem, pair<string, stri
RETURN_IF_ERROR(unescape_path(fields[3], &system_path));
// Strip trailing "/" so that both returned paths match in whether
they have a
// trailing "/".
- if (system_path[system_path.size() - 1] == '/') system_path.pop_back();
+ if (system_path[system_path.size() - 1] == '/') {
+ system_path.pop_back();
+ }
*result = {mount_path, system_path};
return Status::OK();
}
}
-Status CGroupUtil::find_abs_cgroup_path(const string& subsystem, string* path)
{
- RETURN_IF_ERROR(find_global_cgroup(subsystem, path));
+Status CGroupUtil::find_abs_cgroupv1_path(const string& subsystem, string*
path) {
+ if (!cgroupsv1_enable()) {
+ return Status::InvalidArgument("cgroup is not enabled!");
+ }
+ RETURN_IF_ERROR(find_global_cgroupv1(subsystem, path));
pair<string, string> paths;
- RETURN_IF_ERROR(find_cgroup_mounts(subsystem, &paths));
+ RETURN_IF_ERROR(find_cgroupv1_mounts(subsystem, &paths));
const string& mount_path = paths.first;
const string& system_path = paths.second;
if (path->compare(0, system_path.size(), system_path) != 0) {
@@ -158,98 +166,98 @@ Status CGroupUtil::find_abs_cgroup_path(const string&
subsystem, string* path) {
return Status::OK();
}
-Status CGroupUtil::find_cgroup_mem_limit(int64_t* bytes) {
- if (!enable()) {
- return Status::InvalidArgument("cgroup is not enabled!");
+std::string CGroupUtil::cgroupv2_of_process() {
+#if defined(OS_LINUX)
+ if (!cgroupsv2_enable()) {
+ return "";
+ }
+ // All PIDs assigned to a cgroup are in
/sys/fs/cgroups/{cgroup_name}/cgroup.procs
+ // A simpler way to get the membership is:
+ std::ifstream cgroup_name_file("/proc/self/cgroup");
+ if (!cgroup_name_file.is_open()) {
+ return "";
+ }
+ // With cgroups v2, there will be a *single* line with prefix "0::/"
+ // (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
+ std::string cgroup;
+ std::getline(cgroup_name_file, cgroup);
+ static const std::string v2_prefix = "0::/";
+ if (!cgroup.starts_with(v2_prefix)) {
+ return "";
}
- string cgroup_path;
- RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path));
- string limit_file_path = cgroup_path + "/memory.limit_in_bytes";
- return read_cgroup_value(limit_file_path, bytes);
+ cgroup = cgroup.substr(v2_prefix.length());
+ return cgroup;
+#else
+ return "";
+#endif
}
-Status CGroupUtil::find_cgroup_mem_usage(int64_t* bytes) {
- if (!enable()) {
- return Status::InvalidArgument("cgroup is not enabled!");
+std::optional<std::string> CGroupUtil::get_cgroupsv2_path(const std::string&
subsystem) {
+ if (!CGroupUtil::cgroupsv2_enable()) {
+ return {};
}
- string cgroup_path;
- RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path));
- string usage_file_path = cgroup_path + "/memory.usage_in_bytes";
- return read_cgroup_value(usage_file_path, bytes);
-}
-Status CGroupUtil::find_cgroup_mem_info(std::string* file_path) {
- if (!enable()) {
- return Status::InvalidArgument("cgroup is not enabled!");
+ std::string cgroup = CGroupUtil::cgroupv2_of_process();
+ auto current_cgroup = cgroup.empty() ? default_cgroups_mount :
(default_cgroups_mount / cgroup);
+
+ // Return the bottom-most nested current memory file. If there is no such
file at the current
+ // level, try again at the parent level as memory settings are inherited.
+ while (current_cgroup != default_cgroups_mount.parent_path()) {
+ if (std::filesystem::exists(current_cgroup / subsystem)) {
+ return {current_cgroup};
+ }
+ current_cgroup = current_cgroup.parent_path();
}
- string cgroup_path;
- RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path));
- *file_path = cgroup_path + "/memory.stat";
- return Status::OK();
+ return {};
}
-Status CGroupUtil::find_cgroup_cpu_limit(float* cpu_count) {
- if (!enable()) {
- return Status::InvalidArgument("cgroup is not enabled!");
- }
- int64_t quota;
- int64_t period;
- string cgroup_path;
- if (!find_abs_cgroup_path("cpu", &cgroup_path).ok()) {
- RETURN_IF_ERROR(find_abs_cgroup_path("cpuacct", &cgroup_path));
- }
- string cfs_quota_filename = cgroup_path + "/cpu.cfs_quota_us";
- RETURN_IF_ERROR(read_cgroup_value(cfs_quota_filename, "a));
- if (quota <= 0) {
- *cpu_count = -1;
- return Status::OK();
- }
- string cfs_period_filename = cgroup_path + "/cpu.cfs_period_us";
- RETURN_IF_ERROR(read_cgroup_value(cfs_period_filename, &period));
- if (quota <= period) {
- return Status::InvalidArgument("quota <= period");
+Status CGroupUtil::read_int_line_from_cgroup_file(const std::filesystem::path&
file_path,
+ int64_t* val) {
+ std::ifstream file_stream(file_path, std::ios::in);
+ string line;
+ getline(file_stream, line);
+ if (file_stream.fail() || file_stream.bad()) {
+ return Status::CgroupError("Error reading {}: {}", file_path.string(),
get_str_err_msg());
}
- *cpu_count = float(quota) / float(period);
- if (*cpu_count >= FLT_MAX) {
- return Status::InvalidArgument("unknown");
+ StringParser::ParseResult pr;
+ // Parse into an int64_t If it overflows, returning the max value of
int64_t is ok because that
+ // is effectively unlimited.
+ *val = StringParser::string_to_int<int64_t>(line.c_str(), line.size(),
&pr);
+ if ((pr != StringParser::PARSE_SUCCESS && pr !=
StringParser::PARSE_OVERFLOW)) {
+ return Status::InvalidArgument("Failed to parse {} as int64: '{}'",
file_path.string(),
+ line);
}
return Status::OK();
}
-std::string CGroupUtil::debug_string() {
- if (!enable()) {
- return std::string("cgroup is not enabled!");
- }
- string mem_limit_str;
- int64_t mem_limit;
- Status status = find_cgroup_mem_limit(&mem_limit);
- if (status.ok()) {
- mem_limit_str = strings::Substitute("$0", mem_limit);
- } else {
- mem_limit_str = status.to_string();
- }
- string cpu_limit_str;
- float cpu_limit;
- status = find_cgroup_cpu_limit(&cpu_limit);
- if (status.ok()) {
- if (cpu_limit > 0) {
- std::stringstream stream;
- stream << std::fixed << std::setprecision(1) << cpu_limit;
- cpu_limit_str = stream.str();
- } else {
- cpu_limit_str = "unlimited";
+void CGroupUtil::read_int_metric_from_cgroup_file(
+ const std::filesystem::path& file_path,
+ std::unordered_map<std::string, int64_t>& metrics_map) {
+ std::ifstream cgroup_file(file_path, std::ios::in);
+ std::string line;
+ while (cgroup_file.good() && !cgroup_file.eof()) {
+ getline(cgroup_file, line);
+ std::vector<std::string> fields = strings::Split(line, " ",
strings::SkipWhitespace());
+ if (fields.size() < 2) {
+ continue;
}
- } else {
- cpu_limit_str = status.to_string();
- }
- return strings::Substitute("Process CGroup Info: memory.limit_in_bytes=$0,
cpu cfs limits: $1",
- mem_limit_str, cpu_limit_str);
-}
+ std::string key = fields[0].substr(0, fields[0].size());
-bool CGroupUtil::enable() {
- bool exists = true;
- Status st = io::global_local_filesystem()->exists("/proc/cgroups",
&exists);
- return st.ok() && exists;
+ StringParser::ParseResult result;
+ auto value =
+ StringParser::string_to_int<int64_t>(fields[1].data(),
fields[1].size(), &result);
+
+ if (result == StringParser::PARSE_SUCCESS) {
+ if (fields.size() == 2) {
+ metrics_map[key] = value;
+ } else if (fields[2] == "kB") {
+ metrics_map[key] = value * 1024L;
+ }
+ }
+ }
+ if (cgroup_file.is_open()) {
+ cgroup_file.close();
+ }
}
} // namespace doris
diff --git a/be/src/util/cgroup_util.h b/be/src/util/cgroup_util.h
index 2152720ccdd..cf922ba5063 100644
--- a/be/src/util/cgroup_util.h
+++ b/be/src/util/cgroup_util.h
@@ -18,50 +18,88 @@
#pragma once
#include <cstdint>
+#include <filesystem>
+#include <optional>
#include <string>
#include <utility>
#include "common/status.h"
namespace doris {
-class CGroupUtil {
-public:
- // Determines the CGroup memory limit from the current processes' cgroup.
- // If the limit is more than INT64_MAX, INT64_MAX is returned (since that
is
- // effectively unlimited anyway). Does not take into account memory limits
- // set on any ancestor CGroups.
- static Status find_cgroup_mem_limit(int64_t* bytes);
-
- // memory.usage_in_bytes ~= free.used + free.(buff/cache) - (buff)
- //
https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command
- static Status find_cgroup_mem_usage(int64_t* bytes);
- static Status find_cgroup_mem_info(std::string* file_path);
- // Determines the CGroup cpu cores limit from the current processes'
cgroup.
- static Status find_cgroup_cpu_limit(float* cpu_count);
+#if defined(OS_LINUX)
+// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. when in containers).
+// /sys/fs/cgroup was still symlinked to the actual mount in the cases that I have seen.
+static inline const std::filesystem::path default_cgroups_mount =
"/sys/fs/cgroup";
+#endif
- // Returns a human-readable string with information about CGroups.
- static std::string debug_string();
+/* Cgroup debugging steps
+ * Note: writes go through `sudo sh -c "..."` because with `sudo echo ... > file` the
+ * redirection is performed by the unprivileged calling shell and fails.
+ *
+ * CgroupV1:
+ * sudo cgcreate -t username:username -g memory:test
+ * sudo sh -c "echo 6000M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes"
+ * // process started by the current terminal will join Cgroup test
+ * sudo sh -c "echo $$ >> /sys/fs/cgroup/memory/test/cgroup.procs"
+ *
+ * CgroupV2:
+ * sudo mkdir /sys/fs/cgroup/test
+ * sudo sh -c "echo 3000M > /sys/fs/cgroup/test/memory.max"
+ * // process started by the current terminal will join Cgroup test
+ * sudo sh -c "echo $$ >> /sys/fs/cgroup/test/cgroup.procs"
+ * or
+ * // only memory allocated after joining the Cgroup is counted in `memory.current`.
+ * sudo sh -c "echo pid > /sys/fs/cgroup/test/cgroup.procs"
+*/
+class CGroupUtil {
+public:
+ enum class CgroupsVersion : uint8_t { V1, V2 };
// detect if cgroup is enabled
- static bool enable();
+ static bool cgroupsv1_enable();
+ static bool cgroupsv2_enable();
-private:
// return the global cgroup path of subsystem like 12:memory:/user.slice
-> user.slice
- static Status find_global_cgroup(const std::string& subsystem,
std::string* path);
+ static Status find_global_cgroupv1(const std::string& subsystem,
std::string* path);
// Returns the absolute path to the CGroup from inside the container.
// E.g. if this process belongs to
// /sys/fs/cgroup/memory/kubepods/burstable/pod-<long unique id>, which is
mounted at
// /sys/fs/cgroup/memory inside the container, this function returns
// "/sys/fs/cgroup/memory".
- static Status find_abs_cgroup_path(const std::string& subsystem,
std::string* path);
+ static Status find_abs_cgroupv1_path(const std::string& subsystem,
std::string* path);
// Figures out the mapping of the cgroup root from the container's point
of view to
// the full path relative to the system-wide cgroups outside of the
container.
// E.g. /sys/fs/cgroup/memory/kubepods/burstable/pod-<long unique id> may
be mounted at
// /sys/fs/cgroup/memory inside the container. In that case this function
would return
// ("/sys/fs/cgroup/memory", "kubepods/burstable/pod-<long unique id>").
- static Status find_cgroup_mounts(const std::string& subsystem,
- std::pair<std::string, std::string>*
result);
+ static Status find_cgroupv1_mounts(const std::string& subsystem,
+ std::pair<std::string, std::string>*
result);
+
+    // Which cgroup does the process belong to?
+    // Returns an empty string if the cgroup cannot be determined, which includes the
+    // case where cgroupsv2_enable() is false.
+ static std::string cgroupv2_of_process();
+
+    // Caveats:
+    // - All of the logic in this file assumes that the current process is the only process in the
+    //   containing cgroup (or more precisely: the only process with significant memory consumption).
+    //   If this is not the case, then other processes' memory consumption may affect the internal
+    //   memory tracker ...
+    // - Cgroups v1 and v2 allow nested cgroup hierarchies. As v1 has been deprecated for over half a
+    //   decade and will go away at some point, hierarchical detection is only implemented for v2.
+    // - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such
+    //   systems existed only for a short transition period.
+ static std::optional<std::string> get_cgroupsv2_path(const std::string&
subsystem);
+
+ // Cgroup file with only one line of numbers.
+ static Status read_int_line_from_cgroup_file(const std::filesystem::path&
file_path,
+ int64_t* val);
+
+ // Multi-line Cgroup files, format is
+ // kernel 5
+ // rss 15
+ // [...]
+ static void read_int_metric_from_cgroup_file(
+ const std::filesystem::path& file_path,
+ std::unordered_map<std::string, int64_t>& metrics_map);
};
} // namespace doris
diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp
index d0703c985ea..8be1db5cb85 100644
--- a/be/src/util/mem_info.cpp
+++ b/be/src/util/mem_info.cpp
@@ -20,6 +20,8 @@
#include "mem_info.h"
+#include "gutil/strings/split.h"
+
#ifdef __APPLE__
#include <sys/sysctl.h>
#endif
@@ -34,11 +36,10 @@
#include <boost/algorithm/string/trim.hpp>
#include <fstream>
#include <unordered_map>
-#include <vector>
+#include "common/cgroup_memory_ctl.h"
#include "common/config.h"
#include "common/status.h"
-#include "gutil/strings/split.h"
#include "runtime/memory/global_memory_arbitrator.h"
#include "util/cgroup_util.h"
#include "util/parse_util.h"
@@ -75,7 +76,6 @@ std::atomic<int64_t> MemInfo::_s_virtual_memory_used = 0;
int64_t MemInfo::_s_cgroup_mem_limit = std::numeric_limits<int64_t>::max();
int64_t MemInfo::_s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
-static std::unordered_map<std::string, int64_t> _s_cgroup_mem_info_bytes;
bool MemInfo::_s_cgroup_mem_refresh_state = false;
int64_t MemInfo::_s_cgroup_mem_refresh_wait_times = 0;
@@ -179,7 +179,7 @@ void MemInfo::refresh_proc_meminfo() {
if (result == StringParser::PARSE_SUCCESS) {
if (fields.size() == 2) {
_mem_info_bytes[key] = mem_value;
- } else if (fields[2].compare("kB") == 0) {
+ } else if (fields[2] == "kB") {
_mem_info_bytes[key] = mem_value * 1024L;
}
}
@@ -194,65 +194,28 @@ void MemInfo::refresh_proc_meminfo() {
int64_t cgroup_mem_usage = -1;
std::string cgroup_mem_info_file_path;
_s_cgroup_mem_refresh_state = true;
- Status status = CGroupUtil::find_cgroup_mem_limit(&cgroup_mem_limit);
- if (!status.ok() || cgroup_mem_limit <= 0) {
- _s_cgroup_mem_refresh_state = false;
- }
- status = CGroupUtil::find_cgroup_mem_usage(&cgroup_mem_usage);
- if (!status.ok() || cgroup_mem_usage <= 0) {
+ Status status =
CGroupMemoryCtl::find_cgroup_mem_limit(&cgroup_mem_limit);
+ if (!status.ok()) {
_s_cgroup_mem_refresh_state = false;
}
- status = CGroupUtil::find_cgroup_mem_info(&cgroup_mem_info_file_path);
- if (status.ok()) {
- std::ifstream cgroup_meminfo(cgroup_mem_info_file_path,
std::ios::in);
- std::string line;
-
- while (cgroup_meminfo.good() && !cgroup_meminfo.eof()) {
- getline(cgroup_meminfo, line);
- std::vector<std::string> fields =
- strings::Split(line, " ", strings::SkipWhitespace());
- if (fields.size() < 2) {
- continue;
- }
- std::string key = fields[0].substr(0, fields[0].size());
-
- StringParser::ParseResult result;
- auto mem_value =
StringParser::string_to_int<int64_t>(fields[1].data(),
-
fields[1].size(), &result);
-
- if (result == StringParser::PARSE_SUCCESS) {
- if (fields.size() == 2) {
- _s_cgroup_mem_info_bytes[key] = mem_value;
- } else if (fields[2] == "kB") {
- _s_cgroup_mem_info_bytes[key] = mem_value * 1024L;
- }
- }
- }
- if (cgroup_meminfo.is_open()) {
- cgroup_meminfo.close();
- }
- } else {
+ status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage);
+ if (!status.ok()) {
_s_cgroup_mem_refresh_state = false;
}
if (_s_cgroup_mem_refresh_state) {
_s_cgroup_mem_limit = cgroup_mem_limit;
- //
https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command
- // memory.usage_in_bytes ~= free.used + free.(buff/cache) - (buff)
- // so, memory.usage_in_bytes - memory.meminfo["Cached"]
- _s_cgroup_mem_usage = cgroup_mem_usage -
_s_cgroup_mem_info_bytes["cache"];
+ _s_cgroup_mem_usage = cgroup_mem_usage;
// wait 10s, 100 * 100ms, avoid too frequently.
_s_cgroup_mem_refresh_wait_times = -100;
LOG(INFO) << "Refresh cgroup memory win, refresh again after 10s, cgroup mem limit: "
- << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage
- << ", cgroup mem info cached: " << _s_cgroup_mem_info_bytes["cache"];
+ << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage;
} else {
// find cgroup failed, wait 300s, 1000 * 100ms.
_s_cgroup_mem_refresh_wait_times = -3000;
LOG(INFO)
<< "Refresh cgroup memory failed, refresh again after 300s, cgroup mem limit: "
- << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage
- << ", cgroup mem info cached: " << _s_cgroup_mem_info_bytes["cache"];
+ << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage;
}
} else {
if (config::enable_use_cgroup_memory_info) {
@@ -435,7 +398,7 @@ std::string MemInfo::debug_string() {
stream << "Physical Memory: " << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES)
<< std::endl;
stream << "Memory Limt: " << PrettyPrinter::print(_s_mem_limit, TUnit::BYTES) << std::endl;
- stream << "CGroup Info: " << doris::CGroupUtil::debug_string() << std::endl;
+ stream << "CGroup Info: " << doris::CGroupMemoryCtl::debug_string() << std::endl;
return stream.str();
}
diff --git a/be/test/util/cgroup_util_test.cpp b/be/test/util/cgroup_util_test.cpp
index 1553e5f8163..92102120327 100644
--- a/be/test/util/cgroup_util_test.cpp
+++ b/be/test/util/cgroup_util_test.cpp
@@ -23,23 +23,56 @@
#include <fstream>
+#include "common/cgroup_memory_ctl.h"
#include "gtest/gtest_pred_impl.h"
+#include "testutil/test_util.h"
+#include "util/cgroup_util.h"
namespace doris {
class CGroupUtilTest : public ::testing::Test {
protected:
CGroupUtilTest() {}
- virtual ~CGroupUtilTest() {}
+ ~CGroupUtilTest() override = default;
};
+
+TEST_F(CGroupUtilTest, ReadMetrics) {
+ std::string dir_path = GetCurrentRunningDir();
+ std::string memory_limit_in_bytes_path(dir_path);
+ memory_limit_in_bytes_path += "/util/test_data/memory.limit_in_bytes";
+ int64_t value;
+ auto st = CGroupUtil::read_int_line_from_cgroup_file(memory_limit_in_bytes_path, &value);
+ EXPECT_TRUE(st.ok());
+ EXPECT_EQ(6291456000, value);
+
+ std::string memory_stat_path(dir_path);
+ memory_stat_path += "/util/test_data/memory.stat";
+ std::unordered_map<std::string, int64_t> metrics_map;
+ CGroupUtil::read_int_metric_from_cgroup_file(memory_stat_path, metrics_map);
+ EXPECT_EQ(5443584, metrics_map["inactive_file"]);
+ EXPECT_EQ(0, metrics_map["rss"]);
+
+ std::string error_memory_limit_in_bytes_path(dir_path);
+ error_memory_limit_in_bytes_path += "/util/test_data/Zzz/memory.limit_in_bytes";
+ int64_t error_value;
+ auto st2 = CGroupUtil::read_int_line_from_cgroup_file(error_memory_limit_in_bytes_path,
+ &error_value);
+ EXPECT_FALSE(st2.ok());
+
+ std::string error_memory_stat_path(dir_path);
+ error_memory_stat_path += "/util/test_data/Zzz/memory.stat";
+ std::unordered_map<std::string, int64_t> error_metrics_map;
+ CGroupUtil::read_int_metric_from_cgroup_file(error_memory_stat_path, error_metrics_map);
+ EXPECT_TRUE(error_metrics_map.empty());
+}
+
TEST_F(CGroupUtilTest, memlimit) {
- int64_t bytes;
- float cpu_counts;
- CGroupUtil cgroup_util;
- LOG(INFO) << cgroup_util.debug_string();
- Status status1 = cgroup_util.find_cgroup_mem_limit(&bytes);
- Status status2 = cgroup_util.find_cgroup_cpu_limit(&cpu_counts);
- if (cgroup_util.enable()) {
+ LOG(INFO) << CGroupMemoryCtl::debug_string();
+ int64_t mem_limit;
+ int64_t mem_usage;
+ auto status1 = CGroupMemoryCtl::find_cgroup_mem_limit(&mem_limit);
+ auto status2 = CGroupMemoryCtl::find_cgroup_mem_usage(&mem_usage);
+ if (CGroupUtil::cgroupsv1_enable() || CGroupUtil::cgroupsv2_enable()) {
std::ifstream file("/proc/self/cgroup");
if (file.peek() == std::ifstream::traits_type::eof()) {
EXPECT_FALSE(status1.ok());
diff --git a/be/test/util/test_data/memory.limit_in_bytes b/be/test/util/test_data/memory.limit_in_bytes
new file mode 100644
index 00000000000..2081025feea
--- /dev/null
+++ b/be/test/util/test_data/memory.limit_in_bytes
@@ -0,0 +1 @@
+6291456000
diff --git a/be/test/util/test_data/memory.stat b/be/test/util/test_data/memory.stat
new file mode 100644
index 00000000000..858c8433005
--- /dev/null
+++ b/be/test/util/test_data/memory.stat
@@ -0,0 +1,36 @@
+cache 19202048
+rss 0
+rss_huge 0
+shmem 0
+mapped_file 5271552
+dirty 135168
+writeback 0
+swap 0
+pgpgin 14061432
+pgpgout 14058118
+pgfault 17403111
+pgmajfault 15180
+inactive_anon 0
+active_anon 2494464
+inactive_file 5443584
+active_file 10158080
+unevictable 0
+hierarchical_memory_limit 6291456000
+hierarchical_memsw_limit 9223372036854771712
+total_cache 19202048
+total_rss 0
+total_rss_huge 0
+total_shmem 0
+total_mapped_file 5271552
+total_dirty 135168
+total_writeback 0
+total_swap 0
+total_pgpgin 14061432
+total_pgpgout 14058118
+total_pgfault 17403111
+total_pgmajfault 15180
+total_inactive_anon 0
+total_active_anon 2494464
+total_inactive_file 5443584
+total_active_file 10158080
+total_unevictable 0
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]