This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit dced8ca27c5236ee64d5c25e6b87c44a5b3c9814
Author: Michael Smith <[email protected]>
AuthorDate: Wed Jun 21 16:58:43 2023 -0700

    IMPALA-12217: Update cgroup-util to handle cgroups v2
    
    RedHat 9 and Ubuntu 22 switch to cgroups v2, which has a different
    hierarchy than cgroups v1. Ubuntu 20 has a hybrid layout with both
    cgroup and cgroup2 mounted, but the cgroup2 functionality is limited.
    
    Updates cgroup-util to
    - identify available cgroups in FindCGroupMounts. Prefers v1 if
      available, as Ubuntu 20's hybrid layout provides only limited v2
      interfaces.
    - refactors file reading to follow guidelines from
      
https://gehrcke.de/2011/06/reading-files-in-c-using-ifstream-dealing-correctly-with-badbit-failbit-eofbit-and-perror/
      for clearer error handling. Specifically, failbit doesn't set errno, but
      we were printing it anyway (which produced misleading errors).
    - updates FindCGroupMemLimit to read memory.max for cgroups v2.
    - updates DebugString to print the correct property based on cgroup
      version.
    
    Removes unused cgroups test library.
    
    Testing:
    - proc-info-test CGroupInfo.ErrorHandling test on RHEL 9 and Ubuntu 20.
    - verified no error messages related to reading cgroup present in logs
      on RHEL 9 and Ubuntu 20.
    
    Change-Id: I8dc499bd1b490970d30ed6dcd2d16d14ab41ee8c
    Reviewed-on: http://gerrit.cloudera.org:8080/20105
    Reviewed-by: Yida Wu <[email protected]>
    Reviewed-by: Joe McDonnell <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/util/cgroup-util.cc    | 149 +++++++++++++++++++++++++++---------------
 be/src/util/cgroup-util.h     |  23 +++----
 be/src/util/proc-info-test.cc |  20 +++---
 bin/start-impala-cluster.py   |   1 -
 testdata/common/cgroups.py    |  89 -------------------------
 5 files changed, 117 insertions(+), 165 deletions(-)

diff --git a/be/src/util/cgroup-util.cc b/be/src/util/cgroup-util.cc
index 0b09b95ad..e387816e6 100644
--- a/be/src/util/cgroup-util.cc
+++ b/be/src/util/cgroup-util.cc
@@ -19,7 +19,6 @@
 
 #include <algorithm>
 #include <fstream>
-#include <utility>
 #include <vector>
 
 #include <boost/algorithm/string/classification.hpp>
@@ -38,25 +37,21 @@ using boost::algorithm::is_any_of;
 using boost::algorithm::split;
 using boost::algorithm::token_compress_on;
 using strings::CUnescape;
-using std::pair;
 
 namespace impala {
 
 Status CGroupUtil::FindGlobalCGroup(const string& subsystem, string* path) {
   ifstream proc_cgroups("/proc/self/cgroup", ios::in);
+  if (!proc_cgroups.is_open()) {
+    return Status(Substitute("Error opening /proc/self/cgroup: $0", 
GetStrErrMsg()));
+  }
+
   string line;
-  while (true) {
-    if (proc_cgroups.fail() || proc_cgroups.bad()) {
-      return Status(Substitute("Error reading /proc/self/cgroup: $0", 
GetStrErrMsg()));
-    } else if (proc_cgroups.eof()) {
-      return Status(
-          Substitute("Could not find subsystem $0 in /proc/self/cgroup", 
subsystem));
-    }
+  while (getline(proc_cgroups, line)) {
     // The line format looks like this:
     // 4:memory:/user.slice
     // 9:cpu,cpuacct:/user.slice
-    getline(proc_cgroups, line);
-    if (!proc_cgroups.good()) continue;
+    // 0::/user.slice
     vector<string> fields;
     split(fields, line, is_any_of(":"));
     DCHECK_GE(fields.size(), 3);
@@ -75,6 +70,15 @@ Status CGroupUtil::FindGlobalCGroup(const string& subsystem, 
string* path) {
       return Status::OK();
     }
   }
+
+  if (proc_cgroups.bad()) {
+    // Only badbit sets errno.
+    return Status(Substitute("Error reading /proc/self/cgroup: $0", 
GetStrErrMsg()));
+  } else {
+    DCHECK(proc_cgroups.eof());
+    return Status(
+        Substitute("Could not find subsystem '$0' in /proc/self/cgroup", 
subsystem));
+  }
 }
 
 static Status UnescapePath(const string& escaped, string* unescaped) {
@@ -86,16 +90,15 @@ static Status UnescapePath(const string& escaped, string* 
unescaped) {
 }
 
 Status CGroupUtil::FindCGroupMounts(
-    const string& subsystem, pair<string, string>* result) {
+    const string& subsystem, string* mount_path, string* system_path, bool* 
is_v2) {
   ifstream mountinfo("/proc/self/mountinfo", ios::in);
+  if (!mountinfo.is_open()) {
+    return Status(Substitute("Error opening /proc/self/mountinfo: $0", 
GetStrErrMsg()));
+  }
+
   string line;
-  while (true) {
-    if (mountinfo.fail() || mountinfo.bad()) {
-      return Status(Substitute("Error reading /proc/self/mountinfo: $0", 
GetStrErrMsg()));
-    } else if (mountinfo.eof()) {
-      return Status(
-          Substitute("Could not find subsystem $0 in /proc/self/mountinfo", 
subsystem));
-    }
+  bool found_cgroup_v2 = false;
+  while (getline(mountinfo, line)) {
     // The relevant lines look like below (see proc manpage for full 
documentation). The
     // first example is running outside of a container, the second example is 
running
     // inside a docker container. Field 3 is the path relative to the root 
CGroup on
@@ -104,54 +107,92 @@ Status CGroupUtil::FindCGroupMounts(
     //    cgroup cgroup rw,memory
     // 275 271 0:28 /docker/f23eee6f88c2ba99fcce /sys/fs/cgroup/memory
     //    ro,nosuid,nodev,noexec,relatime master:15 - cgroup cgroup rw,memory
-    getline(mountinfo, line);
-    if (!mountinfo.good()) continue;
+    //
+    // CGroup v2 will contain only a single entry marked cgroup2
+    // 29 23 0:26 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime shared:4 -
+    //    cgroup2 cgroup2 rw,nsdelegate,memory_recursiveprot
+    // Some systems (Ubuntu) list both, but CGroup v2 isn't usable, so if we 
identify
+    // a subsystem mount return that, else if we see a CGroup v2 entry return 
it.
+    // 34 24 0:29 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:9 -
+    //    tmpfs tmpfs ro,mode=755,inode64
+    // 35 34 0:30 / /sys/fs/cgroup/unified rw,nosuid,nodev,noexec,relatime 
shared:10 -
+    //    cgroup2 cgroup2 rw,nsdelegate
+    // 50 34 0:45 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime 
shared:26 -
+    //    cgroup cgroup rw,memory
     vector<string> fields;
     split(fields, line, is_any_of(" "), token_compress_on);
     DCHECK_GE(fields.size(), 7);
 
-    if (fields[fields.size() - 3] != "cgroup") continue;
-    // This is a cgroup mount. Check if it's the mount we're looking for.
-    vector<string> cgroup_opts;
-    split(cgroup_opts, fields[fields.size() - 1], is_any_of(","), 
token_compress_on);
-    auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem);
-    if (it == cgroup_opts.end()) continue;
+    bool is_cgroup_v1 = fields[fields.size() - 3] == "cgroup";
+    if (!is_cgroup_v1 && fields[fields.size() - 3] != "cgroup2") continue;
+
+    // This is a cgroup mount.
+    if (is_cgroup_v1) {
+      // Check if it's the subsystem we're looking for.
+      vector<string> cgroup_opts;
+      split(cgroup_opts, fields[fields.size() - 1], is_any_of(","), 
token_compress_on);
+      auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem);
+      if (it == cgroup_opts.end()) continue;
+    }
+
     // This is the right mount.
-    string mount_path, system_path;
-    RETURN_IF_ERROR(UnescapePath(fields[4], &mount_path));
-    RETURN_IF_ERROR(UnescapePath(fields[3], &system_path));
+    RETURN_IF_ERROR(UnescapePath(fields[4], mount_path));
+    RETURN_IF_ERROR(UnescapePath(fields[3], system_path));
     // Strip trailing "/" so that both returned paths match in whether they 
have a
     // trailing "/".
-    if (system_path[system_path.size() - 1] == '/') system_path.pop_back();
-    *result = {mount_path, system_path};
-    return Status::OK();
+    if (system_path->back() == '/') system_path->pop_back();
+    if (is_cgroup_v1) {
+      *is_v2 = false;
+      return Status::OK();
+    } else {
+      // Keep looking in case there are also CGroupv1 mounts.
+      found_cgroup_v2 = true;
+    }
   }
-}
 
-Status CGroupUtil::FindAbsCGroupPath(const string& subsystem, string* path) {
-  RETURN_IF_ERROR(FindGlobalCGroup(subsystem, path));
-  pair<string, string> paths;
-  RETURN_IF_ERROR(FindCGroupMounts(subsystem, &paths));
-  const string& mount_path = paths.first;
-  const string& system_path = paths.second;
-  if (path->compare(0, system_path.size(), system_path) != 0) {
+  if (mountinfo.bad()) {
+    // Only badbit sets errno.
+    return Status(Substitute("Error reading /proc/self/mountinfo: $0", 
GetStrErrMsg()));
+  } else if (found_cgroup_v2) {
+    *is_v2 = true;
+    return Status::OK();
+  } else {
+    DCHECK(mountinfo.eof());
     return Status(
-        Substitute("Expected CGroup path '$0' to start with '$1'", *path, 
system_path));
+        Substitute("Could not find subsystem '$0' in /proc/self/mountinfo", 
subsystem));
   }
-  path->replace(0, system_path.size(), mount_path);
-  return Status::OK();
 }
 
-Status CGroupUtil::FindCGroupMemLimit(int64_t* bytes) {
-  string cgroup_path;
-  RETURN_IF_ERROR(FindAbsCGroupPath("memory", &cgroup_path));
-  string limit_file_path = cgroup_path + "/memory.limit_in_bytes";
-  ifstream limit_file(limit_file_path, ios::in);
-  string line;
+Status CGroupUtil::FindCGroupMemLimit(int64_t* bytes, bool* is_v2) {
+  string mount_path, system_path, cgroup_path, line;
+  bool local_is_v2;
+  RETURN_IF_ERROR(FindCGroupMounts("memory", &mount_path, &system_path, 
&local_is_v2));
+  if (is_v2 != nullptr) {
+    *is_v2 = local_is_v2;
+  }
+  RETURN_IF_ERROR(FindGlobalCGroup(local_is_v2 ? "" : "memory", &cgroup_path));
+  if (cgroup_path.compare(0, system_path.size(), system_path) != 0) {
+    return Status(Substitute(
+        "Expected CGroup path '$0' to start with '$1'", cgroup_path, 
system_path));
+  }
+  cgroup_path.replace(0, system_path.size(), mount_path);
+
+  string limit_file_name = local_is_v2 ? "memory.max" : 
"memory.limit_in_bytes";
+  string limit_file_path = cgroup_path + "/" + limit_file_name;
+  ifstream limit_file(limit_file_path);
+  if (!limit_file.is_open()) {
+    return Status(Substitute("Error opening $0: $1", limit_file_path, 
GetStrErrMsg()));
+  }
   getline(limit_file, line);
-  if (limit_file.fail() || limit_file.bad()) {
+  if (limit_file.bad()) {
     return Status(Substitute("Error reading $0: $1", limit_file_path, 
GetStrErrMsg()));
   }
+
+  if (line == "max") {
+    *bytes = std::numeric_limits<int64_t>::max();
+    return Status::OK();
+  }
+
   StringParser::ParseResult pr;
   // Parse into an an int64_t, since that is what we use for memory amounts 
elsewhere in
   // the codebase. If it overflows, returning the max value of int64_t is ok 
because that
@@ -168,13 +209,15 @@ Status CGroupUtil::FindCGroupMemLimit(int64_t* bytes) {
 std::string CGroupUtil::DebugString() {
   string mem_limit_str;
   int64_t mem_limit;
-  Status status = FindCGroupMemLimit(&mem_limit);
+  bool is_v2;
+  Status status = FindCGroupMemLimit(&mem_limit, &is_v2);
   if (status.ok()) {
     mem_limit_str = Substitute("$0", mem_limit);
   } else {
     mem_limit_str = status.GetDetail();
   }
-  return Substitute("Process CGroup Info: memory.limit_in_bytes=$0", 
mem_limit_str);
+  string property = is_v2 ? "memory.max" : "memory.limit_in_bytes";
+  return Substitute("Process CGroup Info: $0=$1", property, mem_limit_str);
 }
 
 } // namespace impala
diff --git a/be/src/util/cgroup-util.h b/be/src/util/cgroup-util.h
index 7a4ccbe94..4540dccca 100644
--- a/be/src/util/cgroup-util.h
+++ b/be/src/util/cgroup-util.h
@@ -19,7 +19,6 @@
 
 #include <cstdint>
 #include <string>
-#include <utility>
 
 #include "common/status.h"
 
@@ -30,8 +29,9 @@ class CGroupUtil {
   /// Determines the CGroup memory limit from the current processes' cgroup.
   /// If the limit is more than INT64_MAX, INT64_MAX is returned (since that is
   /// effectively unlimited anyway). Does not take into account memory limits
-  /// set on any ancestor CGroups.
-  static Status FindCGroupMemLimit(int64_t* bytes);
+  /// set on any ancestor CGroups. If is_v2 is provided: will return true when
+  /// CGroup v2 is used, false otherwise.
+  static Status FindCGroupMemLimit(int64_t* bytes, bool* is_v2 = nullptr);
 
   /// Returns a human-readable string with information about CGroups.
   static std::string DebugString();
@@ -41,22 +41,17 @@ class CGroupUtil {
   /// Finds the path of the cgroup of 'subsystem' for the current process.
   /// E.g. FindGlobalCGroup("memory") will return the memory cgroup
   /// that this process belongs to. This is a path relative to the system-wide 
root
-  /// cgroup for 'subsystem'.
+  /// cgroup for 'subsystem'. Pass subsystem="" for CGroup V2 unified entries.
   static Status FindGlobalCGroup(const std::string& subsystem, std::string* 
path);
 
-  /// Returns the absolute path to the CGroup from inside the container.
-  /// E.g. if this process belongs to
-  /// /sys/fs/cgroup/memory/kubepods/burstable/pod-<long unique id>, which is 
mounted at
-  /// /sys/fs/cgroup/memory inside the container, this function returns
-  /// "/sys/fs/cgroup/memory".
-  static Status FindAbsCGroupPath(const std::string& subsystem, std::string* 
path);
-
   /// Figures out the mapping of the cgroup root from the container's point of 
view to
   /// the full path relative to the system-wide cgroups outside of the 
container.
   /// E.g. /sys/fs/cgroup/memory/kubepods/burstable/pod-<long unique id> may 
be mounted at
   /// /sys/fs/cgroup/memory inside the container. In that case this function 
would return
-  /// ("/sys/fs/cgroup/memory", "kubepods/burstable/pod-<long unique id>").
-  static Status FindCGroupMounts(
-      const std::string& subsystem, std::pair<std::string, std::string>* 
result);
+  /// ("/sys/fs/cgroup/memory", "/kubepods/burstable/pod-<long unique id>").
+  /// CGroup v2 uses a unified hierarchy, so the result will be the same for 
any
+  /// value of 'subsystem'. Returns whether the mount is for cgroup v1 or v2.
+  static Status FindCGroupMounts(const std::string& subsystem,
+      std::string* mount_path, std::string* system_path, bool* is_v2);
 };
 } // namespace impala
diff --git a/be/src/util/proc-info-test.cc b/be/src/util/proc-info-test.cc
index 87ea572ab..4d4983085 100644
--- a/be/src/util/proc-info-test.cc
+++ b/be/src/util/proc-info-test.cc
@@ -41,15 +41,19 @@ TEST(CGroupInfo, Basic) {
 
 // Test error handling when cgroup is not present.
 TEST(CGroupInfo, ErrorHandling) {
+  string mp, sp;
+  bool isV2;
+  Status err = CGroupUtil::FindCGroupMounts("fake-cgroup", &mp, &sp, &isV2);
+  if (isV2) {
+    // Ignores subsystem, so should always succeed.
+    EXPECT_TRUE(err.ok()) << err;
+  } else {
+    EXPECT_FALSE(err.ok()) << err;
+  }
   string path;
-  Status err = CGroupUtil::FindGlobalCGroup("fake-cgroup", &path);
-  LOG(INFO) << err.msg().msg();
-  EXPECT_FALSE(err.ok());
-  err = CGroupUtil::FindAbsCGroupPath("fake-cgroup", &path);
-  EXPECT_FALSE(err.ok());
-  pair<string, string> p;
-  err = CGroupUtil::FindCGroupMounts("fake-cgroup", &p);
-  EXPECT_FALSE(err.ok());
+  err = CGroupUtil::FindGlobalCGroup("fake-cgroup", &path);
+  // Always fails; v2 lists an empty subsystem.
+  EXPECT_FALSE(err.ok()) << err;
 }
 
 TEST(ProcessStateInfo, Basic) {
diff --git a/bin/start-impala-cluster.py b/bin/start-impala-cluster.py
index 932310cbe..4b0883dc2 100755
--- a/bin/start-impala-cluster.py
+++ b/bin/start-impala-cluster.py
@@ -35,7 +35,6 @@ from getpass import getuser
 from time import sleep, time
 from optparse import OptionParser, SUPPRESS_HELP
 from subprocess import call, check_call, check_output
-from testdata.common import cgroups
 from tests.common.environ import build_flavor_timeout
 from tests.common.impala_cluster import (ImpalaCluster, DEFAULT_BEESWAX_PORT,
     DEFAULT_HS2_PORT, DEFAULT_KRPC_PORT, DEFAULT_HS2_HTTP_PORT,
diff --git a/testdata/common/cgroups.py b/testdata/common/cgroups.py
deleted file mode 100755
index 5e5d5c048..000000000
--- a/testdata/common/cgroups.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/usr/bin/env impala-python
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Utility code for creating cgroups for the Impala development environment.
-# May be used as a library or as a command-line utility for manual testing.
-from __future__ import absolute_import, division, print_function
-from builtins import range
-import os
-import sys
-import errno
-
-from optparse import OptionParser
-
-# Options
-parser = OptionParser()
-parser.add_option("-s", "--cluster_size", type="int", dest="cluster_size", 
default=3,
-                  help="Size of the cluster (number of impalad instances to 
start).")
-
-def get_cpu_controller_root():
-  """Returns the filesystem path of the CPU cgroup controller.
-     Currently assumes the CPU controller is mounted in the standard location.
-     TODO: Read /etc/mounts to find where cpu controller is mounted.
-  """
-  CGROUP_CPU_ROOT = "/sys/fs/cgroup/cpu"
-  if not os.path.isdir(CGROUP_CPU_ROOT):
-    raise Exception("Cgroup CPU controller is not mounted at %s" % 
(CGROUP_CPU_ROOT))
-  return CGROUP_CPU_ROOT
-
-def get_session_cpu_path():
-  """Returns the path of the CPU cgroup hierarchy for this session, which is 
writable
-     by the impalad processes. The cgroup hierarchy is specified as an 
absolute path
-     under the CPU controller root.
-  """
-  PROC_SELF_CGROUP = '/proc/self/cgroup'
-  cgroup_paths = open(PROC_SELF_CGROUP)
-  try:
-    for line in cgroup_paths:
-      parts = line.strip().split(':')
-      if len(parts) == 3 and parts[1] == 'cpu':
-        return parts[2]
-  finally:
-    cgroup_paths.close()
-  raise Exception("Process cgroup CPU hierarchy not found in %s" % 
(PROC_SELF_CGROUP))
-
-def create_impala_cgroup_path(instance_num):
-  """Returns the full filesystem path of a CPU controller cgroup hierarchy 
which is
-     writeable by an impalad. The base cgroup path is read from the 
environment variable
-     IMPALA_CGROUP_BASE_PATH if it is set, otherwise it is set to a child of 
the path of
-     the cgroup for this process.
-
-     instance_num is used to provide different (sibiling) cgroups for each 
impalad
-     instance. The returned cgroup is created if necessary.
-  """
-  parent_cgroup = os.getenv('IMPALA_CGROUP_BASE_PATH')
-  if parent_cgroup is None:
-    # Join root path with the cpu hierarchy path by concatenting the strings. 
Can't use
-    # path.join() because the session cpu hierarchy path looks like an 
absolute FS path.
-    parent_cgroup = "%s%s" % (get_cpu_controller_root(), 
get_session_cpu_path())
-  cgroup_path = os.path.join(parent_cgroup, ("impala-%s" % instance_num))
-  try:
-    os.makedirs(cgroup_path)
-  except OSError as ex:
-    if ex.errno == errno.EEXIST and os.path.isdir(cgroup_path):
-        pass
-    else: raise
-  return cgroup_path
-
-if __name__ == "__main__":
-  if options.cluster_size < 0:
-    print('Please specify a cluster size >= 0')
-    sys.exit(1)
-  for i in range(options.cluster_size):
-    create_impala_cgroup_path(i)

Reply via email to