This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit ad612599df9e866b2a622baa4cdb659ece5c4574 Author: Greg Mann <[email protected]> AuthorDate: Mon Apr 6 15:16:33 2020 -0700 Added agent-side validation for resource limits. This prevents tasks from being launched on agents which would not be capable of enforcing the specified limits. Review: https://reviews.apache.org/r/72297/ --- src/slave/slave.cpp | 64 +++++++++++++++++++++++++++++++++-- src/slave/slave.hpp | 3 ++ src/tests/master_tests.cpp | 10 +++++- src/tests/master_validation_tests.cpp | 20 +++++++++-- 4 files changed, 91 insertions(+), 6 deletions(-) diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 6a48023..1a32c81 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -2168,6 +2168,34 @@ void Slave::runTask( } +Option<Error> Slave::validateResourceLimitsAndIsolators( + const vector<TaskInfo>& tasks) +{ + foreach (const TaskInfo& task, tasks) { + if (!(task.has_container() && + task.container().type() == ContainerInfo::DOCKER)) { + if (task.limits().count("cpus") && + !(strings::contains(flags.isolation, "cgroups/cpu") || + strings::contains(flags.isolation, "cgroups/all"))) { + return Error( + "CPU limits can only be set on tasks launched in Mesos containers" + " when the agent has loaded the 'cgroups/cpu' isolator"); + } + + if (task.limits().count("mem") && + !(strings::contains(flags.isolation, "cgroups/mem") || + strings::contains(flags.isolation, "cgroups/all"))) { + return Error( + "Memory limits can only be set on tasks launched in Mesos" + " containers when the agent has loaded the 'cgroups/mem' isolator"); + } + } + } + + return None(); +} + + void Slave::run( const FrameworkInfo& frameworkInfo, ExecutorInfo executorInfo, @@ -2320,6 +2348,40 @@ void Slave::run( } } + CHECK_NOTNULL(framework); + + Option<Error> error = validateResourceLimitsAndIsolators(tasks); + if (error.isSome()) { + // We report TASK_DROPPED to the framework because the task was + // never launched. For non-partition-aware frameworks, we report + // TASK_LOST for backward compatibility. + mesos::TaskState taskState = TASK_DROPPED; + if (!protobuf::frameworkHasCapability( + frameworkInfo, FrameworkInfo::Capability::PARTITION_AWARE)) { + taskState = TASK_LOST; + } + + foreach (const TaskInfo& _task, tasks) { + const StatusUpdate update = protobuf::createStatusUpdate( + frameworkId, + info.id(), + _task.task_id(), + taskState, + TaskStatus::SOURCE_SLAVE, + id::UUID::random(), + error->message, + TaskStatus::REASON_GC_ERROR); + + statusUpdate(update, UPID()); + } + + if (framework->idle()) { + removeFramework(framework); + } + + return; + } + const ExecutorID& executorId = executorInfo.executor_id(); if (HookManager::hooksAvailable()) { @@ -2342,8 +2404,6 @@ void Slave::run( } } - CHECK_NOTNULL(framework); - // Track the pending task / task group to ensure the framework is // not removed and the framework and top level executor directories // are not scheduled for deletion before '_run()' is called. diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp index d7e65e0..2cf45c6 100644 --- a/src/slave/slave.hpp +++ b/src/slave/slave.hpp @@ -158,6 +158,9 @@ public: const process::UPID& from, RunTaskMessage&& runTaskMessage); + Option<Error> validateResourceLimitsAndIsolators( + const std::vector<TaskInfo>& tasks); + // Made 'virtual' for Slave mocking. virtual void runTask( const process::UPID& from, diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp index 8cc8d20..785e5d5 100644 --- a/src/tests/master_tests.cpp +++ b/src/tests/master_tests.cpp @@ -4330,7 +4330,15 @@ TEST_F(MasterTest, TasksEndpoint) TestContainerizer containerizer(&exec); Owned<MasterDetector> detector = master.get()->createDetector(); - Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), &containerizer); + + // We must enable the CPU and memory isolators on the agent so that it can + // accept resource limits. + slave::Flags flags = CreateSlaveFlags(); + flags.isolation = "cgroups/cpu,cgroups/mem"; + + Try<Owned<cluster::Slave>> slave = + StartSlave(detector.get(), &containerizer, flags); + ASSERT_SOME(slave); MockScheduler sched; diff --git a/src/tests/master_validation_tests.cpp b/src/tests/master_validation_tests.cpp index 9efca42..816635a 100644 --- a/src/tests/master_validation_tests.cpp +++ b/src/tests/master_validation_tests.cpp @@ -5036,7 +5036,13 @@ TEST_F(TaskGroupValidationTest, ResourceLimits) AWAIT_READY(offers); ASSERT_FALSE(offers->empty()); - // Valid, no error. + // Erase the memory limit so we can be sure what error message to look for + // in the next test case. + task1.mutable_limits()->erase("mem"); + task2.mutable_limits()->erase("mem"); + + // Error: the agent does not have any isolators loaded, so it can't enforce + // resource limits and does not launch the task group. { TaskGroupInfo taskGroup; taskGroup.add_tasks()->CopyFrom(task1); @@ -5061,10 +5067,18 @@ TEST_F(TaskGroupValidationTest, ResourceLimits) driver.acceptOffers({offers->at(0).id()}, {operation}, filters); AWAIT_READY(task1Status); - EXPECT_EQ(TASK_STARTING, task1Status->state()); + EXPECT_EQ(TASK_LOST, task1Status->state()); + EXPECT_EQ( + "CPU limits can only be set on tasks launched in Mesos containers" + " when the agent has loaded the 'cgroups/cpu' isolator", + task1Status->message()); AWAIT_READY(task2Status); - EXPECT_EQ(TASK_STARTING, task2Status->state()); + EXPECT_EQ(TASK_LOST, task2Status->state()); + EXPECT_EQ( + "CPU limits can only be set on tasks launched in Mesos containers" + " when the agent has loaded the 'cgroups/cpu' isolator", + task2Status->message()); } }
