This is an automated email from the ASF dual-hosted git repository. michaelsmith pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit dced8ca27c5236ee64d5c25e6b87c44a5b3c9814 Author: Michael Smith <[email protected]> AuthorDate: Wed Jun 21 16:58:43 2023 -0700 IMPALA-12217: Update cgroup-util to handle cgroups v2 RedHat 9 and Ubuntu 22 switch to cgroups v2, which has a different hierarchy than cgroups v1. Ubuntu 20 has a hybrid layout with both cgroup and cgroup2 mounted, but the cgroup2 functionality is limited. Updates cgroup-util to - identify available cgroups in FindCGroupMounts. Prefers v1 if available, as Ubuntu 20's hybrid layout provides only limited v2 interfaces. - refactors file reading to follow guidelines from https://gehrcke.de/2011/06/reading-files-in-c-using-ifstream-dealing-correctly-with-badbit-failbit-eofbit-and-perror/ for clearer error handling. Specifically, failbit doesn't set errno, but we were printing it anyway (which produced misleading errors). - updates FindCGroupMemLimit to read memory.max for cgroups v2. - updates DebugString to print the correct property based on cgroup version. Removes unused cgroups test library. Testing: - proc-info-test CGroupInfo.ErrorHandling test on RHEL 9 and Ubuntu 20. - verified no error messages related to reading cgroup present in logs on RHEL 9 and Ubuntu 20. Change-Id: I8dc499bd1b490970d30ed6dcd2d16d14ab41ee8c Reviewed-on: http://gerrit.cloudera.org:8080/20105 Reviewed-by: Yida Wu <[email protected]> Reviewed-by: Joe McDonnell <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/util/cgroup-util.cc | 149 +++++++++++++++++++++++++++--------------- be/src/util/cgroup-util.h | 23 +++---- be/src/util/proc-info-test.cc | 20 +++--- bin/start-impala-cluster.py | 1 - testdata/common/cgroups.py | 89 ------------------------- 5 files changed, 117 insertions(+), 165 deletions(-) diff --git a/be/src/util/cgroup-util.cc b/be/src/util/cgroup-util.cc index 0b09b95ad..e387816e6 100644 --- a/be/src/util/cgroup-util.cc +++ b/be/src/util/cgroup-util.cc @@ -19,7 +19,6 @@ #include <algorithm> #include <fstream> -#include <utility> #include <vector> #include <boost/algorithm/string/classification.hpp> @@ -38,25 +37,21 @@ using boost::algorithm::is_any_of; using boost::algorithm::split; using boost::algorithm::token_compress_on; using strings::CUnescape; -using std::pair; namespace impala { Status CGroupUtil::FindGlobalCGroup(const string& subsystem, string* path) { ifstream proc_cgroups("/proc/self/cgroup", ios::in); + if (!proc_cgroups.is_open()) { + return Status(Substitute("Error opening /proc/self/cgroup: $0", GetStrErrMsg())); + } + string line; - while (true) { - if (proc_cgroups.fail() || proc_cgroups.bad()) { - return Status(Substitute("Error reading /proc/self/cgroup: $0", GetStrErrMsg())); - } else if (proc_cgroups.eof()) { - return Status( - Substitute("Could not find subsystem $0 in /proc/self/cgroup", subsystem)); - } + while (getline(proc_cgroups, line)) { // The line format looks like this: // 4:memory:/user.slice // 9:cpu,cpuacct:/user.slice - getline(proc_cgroups, line); - if (!proc_cgroups.good()) continue; + // 0::/user.slice vector<string> fields; split(fields, line, is_any_of(":")); DCHECK_GE(fields.size(), 3); @@ -75,6 +70,15 @@ Status CGroupUtil::FindGlobalCGroup(const string& subsystem, string* path) { return Status::OK(); } } + + if (proc_cgroups.bad()) { + // Only badbit sets errno. + return Status(Substitute("Error reading /proc/self/cgroup: $0", GetStrErrMsg())); + } else { + DCHECK(proc_cgroups.eof()); + return Status( + Substitute("Could not find subsystem '$0' in /proc/self/cgroup", subsystem)); + } } static Status UnescapePath(const string& escaped, string* unescaped) { @@ -86,16 +90,15 @@ static Status UnescapePath(const string& escaped, string* unescaped) { } Status CGroupUtil::FindCGroupMounts( - const string& subsystem, pair<string, string>* result) { + const string& subsystem, string* mount_path, string* system_path, bool* is_v2) { ifstream mountinfo("/proc/self/mountinfo", ios::in); + if (!mountinfo.is_open()) { + return Status(Substitute("Error opening /proc/self/mountinfo: $0", GetStrErrMsg())); + } + string line; - while (true) { - if (mountinfo.fail() || mountinfo.bad()) { - return Status(Substitute("Error reading /proc/self/mountinfo: $0", GetStrErrMsg())); - } else if (mountinfo.eof()) { - return Status( - Substitute("Could not find subsystem $0 in /proc/self/mountinfo", subsystem)); - } + bool found_cgroup_v2 = false; + while (getline(mountinfo, line)) { // The relevant lines look like below (see proc manpage for full documentation). The // first example is running outside of a container, the second example is running // inside a docker container. Field 3 is the path relative to the root CGroup on @@ -104,54 +107,92 @@ Status CGroupUtil::FindCGroupMounts( // cgroup cgroup rw,memory // 275 271 0:28 /docker/f23eee6f88c2ba99fcce /sys/fs/cgroup/memory // ro,nosuid,nodev,noexec,relatime master:15 - cgroup cgroup rw,memory - getline(mountinfo, line); - if (!mountinfo.good()) continue; + // + // CGroup v2 will contain only a single entry marked cgroup2 + // 29 23 0:26 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime shared:4 - + // cgroup2 cgroup2 rw,nsdelegate,memory_recursiveprot + // Some systems (Ubuntu) list both, but CGroup v2 isn't usable, so if we identify + // a subsystem mount return that, else if we see a CGroup v2 entry return it. + // 34 24 0:29 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:9 - + // tmpfs tmpfs ro,mode=755,inode64 + // 35 34 0:30 / /sys/fs/cgroup/unified rw,nosuid,nodev,noexec,relatime shared:10 - + // cgroup2 cgroup2 rw,nsdelegate + // 50 34 0:45 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:26 - + // cgroup cgroup rw,memory vector<string> fields; split(fields, line, is_any_of(" "), token_compress_on); DCHECK_GE(fields.size(), 7); - if (fields[fields.size() - 3] != "cgroup") continue; - // This is a cgroup mount. Check if it's the mount we're looking for. - vector<string> cgroup_opts; - split(cgroup_opts, fields[fields.size() - 1], is_any_of(","), token_compress_on); - auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem); - if (it == cgroup_opts.end()) continue; + bool is_cgroup_v1 = fields[fields.size() - 3] == "cgroup"; + if (!is_cgroup_v1 && fields[fields.size() - 3] != "cgroup2") continue; + + // This is a cgroup mount. + if (is_cgroup_v1) { + // Check if it's the subsystem we're looking for. + vector<string> cgroup_opts; + split(cgroup_opts, fields[fields.size() - 1], is_any_of(","), token_compress_on); + auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem); + if (it == cgroup_opts.end()) continue; + } + // This is the right mount. - string mount_path, system_path; - RETURN_IF_ERROR(UnescapePath(fields[4], &mount_path)); - RETURN_IF_ERROR(UnescapePath(fields[3], &system_path)); + RETURN_IF_ERROR(UnescapePath(fields[4], mount_path)); + RETURN_IF_ERROR(UnescapePath(fields[3], system_path)); // Strip trailing "/" so that both returned paths match in whether they have a // trailing "/". - if (system_path[system_path.size() - 1] == '/') system_path.pop_back(); - *result = {mount_path, system_path}; - return Status::OK(); + if (system_path->back() == '/') system_path->pop_back(); + if (is_cgroup_v1) { + *is_v2 = false; + return Status::OK(); + } else { + // Keep looking in case there are also CGroupv1 mounts. + found_cgroup_v2 = true; + } } -} -Status CGroupUtil::FindAbsCGroupPath(const string& subsystem, string* path) { - RETURN_IF_ERROR(FindGlobalCGroup(subsystem, path)); - pair<string, string> paths; - RETURN_IF_ERROR(FindCGroupMounts(subsystem, &paths)); - const string& mount_path = paths.first; - const string& system_path = paths.second; - if (path->compare(0, system_path.size(), system_path) != 0) { + if (mountinfo.bad()) { + // Only badbit sets errno. + return Status(Substitute("Error reading /proc/self/mountinfo: $0", GetStrErrMsg())); + } else if (found_cgroup_v2) { + *is_v2 = true; + return Status::OK(); + } else { + DCHECK(mountinfo.eof()); return Status( - Substitute("Expected CGroup path '$0' to start with '$1'", *path, system_path)); + Substitute("Could not find subsystem '$0' in /proc/self/mountinfo", subsystem)); } - path->replace(0, system_path.size(), mount_path); - return Status::OK(); } -Status CGroupUtil::FindCGroupMemLimit(int64_t* bytes) { - string cgroup_path; - RETURN_IF_ERROR(FindAbsCGroupPath("memory", &cgroup_path)); - string limit_file_path = cgroup_path + "/memory.limit_in_bytes"; - ifstream limit_file(limit_file_path, ios::in); - string line; +Status CGroupUtil::FindCGroupMemLimit(int64_t* bytes, bool* is_v2) { + string mount_path, system_path, cgroup_path, line; + bool local_is_v2; + RETURN_IF_ERROR(FindCGroupMounts("memory", &mount_path, &system_path, &local_is_v2)); + if (is_v2 != nullptr) { + *is_v2 = local_is_v2; + } + RETURN_IF_ERROR(FindGlobalCGroup(local_is_v2 ? "" : "memory", &cgroup_path)); + if (cgroup_path.compare(0, system_path.size(), system_path) != 0) { + return Status(Substitute( + "Expected CGroup path '$0' to start with '$1'", cgroup_path, system_path)); + } + cgroup_path.replace(0, system_path.size(), mount_path); + + string limit_file_name = local_is_v2 ? "memory.max" : "memory.limit_in_bytes"; + string limit_file_path = cgroup_path + "/" + limit_file_name; + ifstream limit_file(limit_file_path); + if (!limit_file.is_open()) { + return Status(Substitute("Error opening $0: $1", limit_file_path, GetStrErrMsg())); + } getline(limit_file, line); - if (limit_file.fail() || limit_file.bad()) { + if (limit_file.bad()) { return Status(Substitute("Error reading $0: $1", limit_file_path, GetStrErrMsg())); } + + if (line == "max") { + *bytes = std::numeric_limits<int64_t>::max(); + return Status::OK(); + } + StringParser::ParseResult pr; // Parse into an an int64_t, since that is what we use for memory amounts elsewhere in // the codebase. If it overflows, returning the max value of int64_t is ok because that @@ -168,13 +209,15 @@ Status CGroupUtil::FindCGroupMemLimit(int64_t* bytes) { std::string CGroupUtil::DebugString() { string mem_limit_str; int64_t mem_limit; - Status status = FindCGroupMemLimit(&mem_limit); + bool is_v2; + Status status = FindCGroupMemLimit(&mem_limit, &is_v2); if (status.ok()) { mem_limit_str = Substitute("$0", mem_limit); } else { mem_limit_str = status.GetDetail(); } - return Substitute("Process CGroup Info: memory.limit_in_bytes=$0", mem_limit_str); + string property = is_v2 ? "memory.max" : "memory.limit_in_bytes"; + return Substitute("Process CGroup Info: $0=$1", property, mem_limit_str); } } // namespace impala diff --git a/be/src/util/cgroup-util.h b/be/src/util/cgroup-util.h index 7a4ccbe94..4540dccca 100644 --- a/be/src/util/cgroup-util.h +++ b/be/src/util/cgroup-util.h @@ -19,7 +19,6 @@ #include <cstdint> #include <string> -#include <utility> #include "common/status.h" @@ -30,8 +29,9 @@ class CGroupUtil { /// Determines the CGroup memory limit from the current processes' cgroup. /// If the limit is more than INT64_MAX, INT64_MAX is returned (since that is /// effectively unlimited anyway). Does not take into account memory limits - /// set on any ancestor CGroups. - static Status FindCGroupMemLimit(int64_t* bytes); + /// set on any ancestor CGroups. If is_v2 is provided: will return true when + /// CGroup v2 is used, false otherwise. + static Status FindCGroupMemLimit(int64_t* bytes, bool* is_v2 = nullptr); /// Returns a human-readable string with information about CGroups. static std::string DebugString(); @@ -41,22 +41,17 @@ class CGroupUtil { /// Finds the path of the cgroup of 'subsystem' for the current process. /// E.g. FindGlobalCGroup("memory") will return the memory cgroup /// that this process belongs to. This is a path relative to the system-wide root - /// cgroup for 'subsystem'. + /// cgroup for 'subsystem'. Pass subsystem="" for CGroup V2 unified entries. static Status FindGlobalCGroup(const std::string& subsystem, std::string* path); - /// Returns the absolute path to the CGroup from inside the container. - /// E.g. if this process belongs to - /// /sys/fs/cgroup/memory/kubepods/burstable/pod-<long unique id>, which is mounted at - /// /sys/fs/cgroup/memory inside the container, this function returns - /// "/sys/fs/cgroup/memory". - static Status FindAbsCGroupPath(const std::string& subsystem, std::string* path); - /// Figures out the mapping of the cgroup root from the container's point of view to /// the full path relative to the system-wide cgroups outside of the container. /// E.g. /sys/fs/cgroup/memory/kubepods/burstable/pod-<long unique id> may be mounted at /// /sys/fs/cgroup/memory inside the container. In that case this function would return - /// ("/sys/fs/cgroup/memory", "kubepods/burstable/pod-<long unique id>"). - static Status FindCGroupMounts( - const std::string& subsystem, std::pair<std::string, std::string>* result); + /// ("/sys/fs/cgroup/memory", "/kubepods/burstable/pod-<long unique id>"). + /// CGroup v2 uses a unified hierarchy, so the result will be the same for any + /// value of 'subsystem'. Returns whether the mount is for cgroup v1 or v2. + static Status FindCGroupMounts(const std::string& subsystem, + std::string* mount_path, std::string* system_path, bool* is_v2); }; } // namespace impala diff --git a/be/src/util/proc-info-test.cc b/be/src/util/proc-info-test.cc index 87ea572ab..4d4983085 100644 --- a/be/src/util/proc-info-test.cc +++ b/be/src/util/proc-info-test.cc @@ -41,15 +41,19 @@ TEST(CGroupInfo, Basic) { // Test error handling when cgroup is not present. TEST(CGroupInfo, ErrorHandling) { + string mp, sp; + bool isV2; + Status err = CGroupUtil::FindCGroupMounts("fake-cgroup", &mp, &sp, &isV2); + if (isV2) { + // Ignores subsystem, so should always succeed. + EXPECT_TRUE(err.ok()) << err; + } else { + EXPECT_FALSE(err.ok()) << err; + } string path; - Status err = CGroupUtil::FindGlobalCGroup("fake-cgroup", &path); - LOG(INFO) << err.msg().msg(); - EXPECT_FALSE(err.ok()); - err = CGroupUtil::FindAbsCGroupPath("fake-cgroup", &path); - EXPECT_FALSE(err.ok()); - pair<string, string> p; - err = CGroupUtil::FindCGroupMounts("fake-cgroup", &p); - EXPECT_FALSE(err.ok()); + err = CGroupUtil::FindGlobalCGroup("fake-cgroup", &path); + // Always fails; v2 lists an empty subsystem. + EXPECT_FALSE(err.ok()) << err; } TEST(ProcessStateInfo, Basic) { diff --git a/bin/start-impala-cluster.py b/bin/start-impala-cluster.py index 932310cbe..4b0883dc2 100755 --- a/bin/start-impala-cluster.py +++ b/bin/start-impala-cluster.py @@ -35,7 +35,6 @@ from getpass import getuser from time import sleep, time from optparse import OptionParser, SUPPRESS_HELP from subprocess import call, check_call, check_output -from testdata.common import cgroups from tests.common.environ import build_flavor_timeout from tests.common.impala_cluster import (ImpalaCluster, DEFAULT_BEESWAX_PORT, DEFAULT_HS2_PORT, DEFAULT_KRPC_PORT, DEFAULT_HS2_HTTP_PORT, diff --git a/testdata/common/cgroups.py b/testdata/common/cgroups.py deleted file mode 100755 index 5e5d5c048..000000000 --- a/testdata/common/cgroups.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env impala-python -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Utility code for creating cgroups for the Impala development environment. -# May be used as a library or as a command-line utility for manual testing. -from __future__ import absolute_import, division, print_function -from builtins import range -import os -import sys -import errno - -from optparse import OptionParser - -# Options -parser = OptionParser() -parser.add_option("-s", "--cluster_size", type="int", dest="cluster_size", default=3, - help="Size of the cluster (number of impalad instances to start).") - -def get_cpu_controller_root(): - """Returns the filesystem path of the CPU cgroup controller. - Currently assumes the CPU controller is mounted in the standard location. - TODO: Read /etc/mounts to find where cpu controller is mounted. - """ - CGROUP_CPU_ROOT = "/sys/fs/cgroup/cpu" - if not os.path.isdir(CGROUP_CPU_ROOT): - raise Exception("Cgroup CPU controller is not mounted at %s" % (CGROUP_CPU_ROOT)) - return CGROUP_CPU_ROOT - -def get_session_cpu_path(): - """Returns the path of the CPU cgroup hierarchy for this session, which is writable - by the impalad processes. The cgroup hierarchy is specified as an absolute path - under the CPU controller root. - """ - PROC_SELF_CGROUP = '/proc/self/cgroup' - cgroup_paths = open(PROC_SELF_CGROUP) - try: - for line in cgroup_paths: - parts = line.strip().split(':') - if len(parts) == 3 and parts[1] == 'cpu': - return parts[2] - finally: - cgroup_paths.close() - raise Exception("Process cgroup CPU hierarchy not found in %s" % (PROC_SELF_CGROUP)) - -def create_impala_cgroup_path(instance_num): - """Returns the full filesystem path of a CPU controller cgroup hierarchy which is - writeable by an impalad. The base cgroup path is read from the environment variable - IMPALA_CGROUP_BASE_PATH if it is set, otherwise it is set to a child of the path of - the cgroup for this process. - - instance_num is used to provide different (sibiling) cgroups for each impalad - instance. The returned cgroup is created if necessary. - """ - parent_cgroup = os.getenv('IMPALA_CGROUP_BASE_PATH') - if parent_cgroup is None: - # Join root path with the cpu hierarchy path by concatenting the strings. Can't use - # path.join() because the session cpu hierarchy path looks like an absolute FS path. - parent_cgroup = "%s%s" % (get_cpu_controller_root(), get_session_cpu_path()) - cgroup_path = os.path.join(parent_cgroup, ("impala-%s" % instance_num)) - try: - os.makedirs(cgroup_path) - except OSError as ex: - if ex.errno == errno.EEXIST and os.path.isdir(cgroup_path): - pass - else: raise - return cgroup_path - -if __name__ == "__main__": - if options.cluster_size < 0: - print('Please specify a cluster size >= 0') - sys.exit(1) - for i in range(options.cluster_size): - create_impala_cgroup_path(i)
