This is an automated email from the ASF dual-hosted git repository. qianzhang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 8d51df87b058144d0ce51008e393b6261b6e9765 Author: Qian Zhang <[email protected]> AuthorDate: Thu Jan 2 09:05:43 2020 +0800 Set container process's OOM score adjust. Review: https://reviews.apache.org/r/71944 --- .../mesos/isolators/cgroups/cgroups.cpp | 8 +- .../mesos/isolators/cgroups/subsystem.cpp | 9 ++- .../mesos/isolators/cgroups/subsystem.hpp | 7 +- .../mesos/isolators/cgroups/subsystems/devices.cpp | 3 +- .../mesos/isolators/cgroups/subsystems/devices.hpp | 3 +- .../mesos/isolators/cgroups/subsystems/memory.cpp | 87 +++++++++++++++++++++- .../mesos/isolators/cgroups/subsystems/memory.hpp | 14 +++- .../mesos/isolators/cgroups/subsystems/net_cls.cpp | 3 +- .../mesos/isolators/cgroups/subsystems/net_cls.hpp | 3 +- .../isolators/cgroups/subsystems/perf_event.cpp | 3 +- .../isolators/cgroups/subsystems/perf_event.hpp | 3 +- src/slave/containerizer/mesos/utils.cpp | 20 +++++ src/slave/containerizer/mesos/utils.hpp | 3 + 13 files changed, 148 insertions(+), 18 deletions(-) diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp index 8e858f4..4193538 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp @@ -484,7 +484,8 @@ Future<Option<ContainerLaunchInfo>> CgroupsIsolatorProcess::prepare( infos[containerId]->subsystems.insert(subsystem->name()); prepares.push_back(subsystem->prepare( containerId, - infos[containerId]->cgroup)); + infos[containerId]->cgroup, + containerConfig)); } // Chown the cgroup so the executor or a nested container whose @@ -705,10 +706,7 @@ Future<Nothing> CgroupsIsolatorProcess::isolate( // containers with shared cgroups, because we don't call `prepare()`, // `recover()`, or `cleanup()` on them either. If we were to call // `isolate()` on them, the call would likely fail because the subsystem - // doesn't know about the container. This is currently OK because - // the only cgroup isolator that even implements `isolate()` is the - // `NetClsSubsystem` and it doesn't do anything with the `pid` - // passed in. + // doesn't know about the container. // // TODO(klueska): In the future we should revisit this to make // sure that doing things this way is sufficient (or otherwise diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp index d9c8fa7..6393bee 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp @@ -116,13 +116,15 @@ Future<Nothing> Subsystem::recover( Future<Nothing> Subsystem::prepare( const ContainerID& containerId, - const string& cgroup) + const string& cgroup, + const mesos::slave::ContainerConfig& containerConfig) { return process::dispatch( process.get(), &SubsystemProcess::prepare, containerId, - cgroup); + cgroup, + containerConfig); } @@ -221,7 +223,8 @@ Future<Nothing> SubsystemProcess::recover( Future<Nothing> SubsystemProcess::prepare( const ContainerID& containerId, - const string& cgroup) + const string& cgroup, + const mesos::slave::ContainerConfig& containerConfig) { return Nothing(); } diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp index 088d417..7d33901 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp @@ -90,11 +90,13 @@ public: * * @param containerId The target containerId. * @param cgroup The target cgroup. + * @param containerConfig The container configuration. * @return Nothing or an error if `prepare` fails. */ process::Future<Nothing> prepare( const ContainerID& containerId, - const std::string& cgroup); + const std::string& cgroup, + const mesos::slave::ContainerConfig& containerConfig); /** * Isolate the associated container to cgroups subsystem. @@ -198,7 +200,8 @@ public: virtual process::Future<Nothing> prepare( const ContainerID& containerId, - const std::string& cgroup); + const std::string& cgroup, + const mesos::slave::ContainerConfig& containerConfig); virtual process::Future<Nothing> isolate( const ContainerID& containerId, diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp index ac2e66b..d1de13a 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp @@ -167,7 +167,8 @@ Future<Nothing> DevicesSubsystemProcess::recover( Future<Nothing> DevicesSubsystemProcess::prepare( const ContainerID& containerId, - const string& cgroup) + const string& cgroup, + const mesos::slave::ContainerConfig& containerConfig) { if (containerIds.contains(containerId)) { return Failure("The subsystem '" + name() + "' has already been prepared"); diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp index c62deec..8c34c80 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp @@ -55,7 +55,8 @@ public: process::Future<Nothing> prepare( const ContainerID& containerId, - const std::string& cgroup) override; + const std::string& cgroup, + const mesos::slave::ContainerConfig& containerConfig) override; process::Future<Nothing> recover( const ContainerID& containerId, diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp index 4102985..15f87ba 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp @@ -32,11 +32,14 @@ #include "common/protobuf_utils.hpp" +#include "slave/containerizer/mesos/utils.hpp" + #include "slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp" using cgroups::memory::pressure::Counter; using cgroups::memory::pressure::Level; +using mesos::slave::ContainerConfig; using mesos::slave::ContainerLimitation; using process::Failure; @@ -135,7 +138,8 @@ Future<Nothing> MemorySubsystemProcess::recover( Future<Nothing> MemorySubsystemProcess::prepare( const ContainerID& containerId, - const string& cgroup) + const string& cgroup, + const ContainerConfig& containerConfig) { if (infos.contains(containerId)) { return Failure("The subsystem '" + name() + "' has already been prepared"); @@ -143,6 +147,7 @@ Future<Nothing> MemorySubsystemProcess::prepare( infos.put(containerId, Owned<Info>(new Info)); infos[containerId]->hardLimitUpdated = false; + infos[containerId]->isCommandTask = containerConfig.has_task_info(); oomListen(containerId, cgroup); pressureListen(containerId, cgroup); @@ -151,6 +156,86 @@ Future<Nothing> MemorySubsystemProcess::prepare( } +Future<Nothing> MemorySubsystemProcess::isolate( + const ContainerID& containerId, + const string& cgroup, + pid_t pid) +{ + if (!infos.contains(containerId)) { + return Failure( + "Failed to isolate subsystem '" + name() + "'" + ": Unknown container"); + } + + // Get the soft limit. + Try<Bytes> softLimit = + cgroups::memory::soft_limit_in_bytes(hierarchy, cgroup); + + if (softLimit.isError()) { + return Failure( + "Failed to read 'memory.soft_limit_in_bytes'" + ": " + softLimit.error()); + } + + // Get the hard limit. + Try<Bytes> hardLimit = + cgroups::memory::limit_in_bytes(hierarchy, cgroup); + + if (hardLimit.isError()) { + return Failure( + "Failed to read 'memory.limit_in_bytes'" + ": " + hardLimit.error()); + } + + // While the OOM score of a process is a complex function of the process state + // and configuration, a decent approximation of the OOM score is 10 x percent + // of memory used by the process + `/proc/$pid/oom_score_adj` (a configurable + // quantity which is between -1000 and 1000). Containers with higher OOM + // scores are killed if the system runs out of memory. + // + // We would like burstable task containers which consume more memory than + // their memory requests (i.e. soft limits) to be preferentially OOM-killed + // first. To accomplish this, we set their OOM score adjustment as shown + // below, which attempts to ensure that the container which consumes more + // memory than its memory request will have an OOM score of 1000. + // + // Please note that there are two kinds of burstable task containers: + // 1. Command task containers whose soft limit < hard limit. + // 2. Nested task containers whose soft limit < hard limit. + // + // For any other kinds of containers (see below), we will just leave their OOM + // score adjustments at the default value (i.e. 0). + // 1. Containers whose soft limit == hard limit, this is to ensure backward + // compatibility. + // 2. Default executor containers whose soft limit < hard limit. + // 3. Custom executor containers whose soft limit < hard limit. + // 4. Debug containers. + if (softLimit.get() < hardLimit.get() && + (infos[containerId]->isCommandTask || containerId.has_parent())) { + Try<int> oomScoreAdj = calculateOOMScoreAdj(softLimit.get()); + if (oomScoreAdj.isError()) { + return Failure( + "Failed to calculate OOM score adjustment: " + oomScoreAdj.error()); + } + + const string oomScoreAdjPath = + strings::format("/proc/%d/oom_score_adj", pid).get(); + + Try<Nothing> write = + os::write(oomScoreAdjPath, stringify(oomScoreAdj.get())); + + if (write.isError()) { + return Failure("Failed to set OOM score adjustment: " + write.error()); + } + + LOG(INFO) << "Set " << oomScoreAdjPath << " to " << oomScoreAdj.get() + << " for container " << containerId; + } + + return Nothing(); +} + + Future<ContainerLimitation> MemorySubsystemProcess::watch( const ContainerID& containerId, const string& cgroup) diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp index ed34df8..a4bbef8 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp @@ -57,7 +57,13 @@ public: process::Future<Nothing> prepare( const ContainerID& containerId, - const std::string& cgroup) override; + const std::string& cgroup, + const mesos::slave::ContainerConfig& containerConfig) override; + + process::Future<Nothing> isolate( + const ContainerID& containerId, + const std::string& cgroup, + pid_t pid) override; process::Future<Nothing> recover( const ContainerID& containerId, @@ -97,6 +103,12 @@ private: // Indicate whether the memory hard limit of this container has // already been updated. bool hardLimitUpdated; + + // Indicates whether this is a command task container. Please note + // that we only need to use this field in isolating phase, so we do + // not recover it after agent restarts, that means its value may not + // be correct after agent recovery. + bool isCommandTask; }; MemorySubsystemProcess(const Flags& flags, const std::string& hierarchy); diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp index ec2ce67..e140194 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp @@ -336,7 +336,8 @@ Future<Nothing> NetClsSubsystemProcess::recover( Future<Nothing> NetClsSubsystemProcess::prepare( const ContainerID& containerId, - const string& cgroup) + const string& cgroup, + const mesos::slave::ContainerConfig& containerConfig) { if (infos.contains(containerId)) { return Failure("The subsystem '" + name() + "' has already been prepared"); diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp index 0653107..c604a2b 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp @@ -150,7 +150,8 @@ public: process::Future<Nothing> prepare( const ContainerID& containerId, - const std::string& cgroup) override; + const std::string& cgroup, + const mesos::slave::ContainerConfig& containerConfig) override; process::Future<Nothing> isolate( const ContainerID& containerId, diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp index 180afc9..e88eea4 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp @@ -115,7 +115,8 @@ Future<Nothing> PerfEventSubsystemProcess::recover( Future<Nothing> PerfEventSubsystemProcess::prepare( const ContainerID& containerId, - const string& cgroup) + const string& cgroup, + const mesos::slave::ContainerConfig& containerConfig) { if (infos.contains(containerId)) { return Failure("The subsystem '" + name() + "' has already been prepared"); diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp index 2c865ac..cac04fe 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp @@ -57,7 +57,8 @@ public: process::Future<Nothing> prepare( const ContainerID& containerId, - const std::string& cgroup) override; + const std::string& cgroup, + const mesos::slave::ContainerConfig& containerConfig) override; process::Future<Nothing> recover( const ContainerID& containerId, diff --git a/src/slave/containerizer/mesos/utils.cpp b/src/slave/containerizer/mesos/utils.cpp index d9964f0..970aa59 100644 --- a/src/slave/containerizer/mesos/utils.cpp +++ b/src/slave/containerizer/mesos/utils.cpp @@ -140,6 +140,26 @@ Try<pid_t> getMountNamespaceTarget(pid_t parent) } #endif // __linux__ + +Try<int> calculateOOMScoreAdj(const Bytes& memRequest) +{ + // Get the total memory of this node. + static Option<Bytes> totalMem; + if (totalMem.isNone()) { + Try<os::Memory> mem = os::memory(); + if (mem.isError()) { + return Error( + "Failed to auto-detect the size of main memory: " + mem.error()); + } + + totalMem = mem->total; + } + + CHECK_SOME(totalMem); + + return 1000 - (1000 * memRequest.bytes()) / totalMem->bytes(); +} + } // namespace slave { } // namespace internal { } // namespace mesos { diff --git a/src/slave/containerizer/mesos/utils.hpp b/src/slave/containerizer/mesos/utils.hpp index bfd07e2..4a31dfd 100644 --- a/src/slave/containerizer/mesos/utils.hpp +++ b/src/slave/containerizer/mesos/utils.hpp @@ -30,6 +30,9 @@ namespace slave { Try<pid_t> getMountNamespaceTarget(pid_t parent); #endif // __linux__ + +Try<int> calculateOOMScoreAdj(const Bytes& memRequest); + } // namespace slave { } // namespace internal { } // namespace mesos {
