This is an automated email from the ASF dual-hosted git repository.

mzhu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 5849c4af5fc702d9bb366da44f58ae93a2712f3a
Author:     Meng Zhu <[email protected]>
AuthorDate: Thu Sep 12 11:46:23 2019 -0700

    Simplified recover resources when removing frameworks or agents.

    Review: https://reviews.apache.org/r/71476
---
 src/master/allocator/mesos/hierarchical.cpp | 85 ++++++++++++++++-------------
 src/master/allocator/mesos/hierarchical.hpp |  8 +++
 src/master/master.cpp                       | 12 ----
 3 files changed, 55 insertions(+), 50 deletions(-)

diff --git a/src/master/allocator/mesos/hierarchical.cpp b/src/master/allocator/mesos/hierarchical.cpp
index 62750cc..4728154 100644
--- a/src/master/allocator/mesos/hierarchical.cpp
+++ b/src/master/allocator/mesos/hierarchical.cpp
@@ -675,6 +675,15 @@ void HierarchicalAllocatorProcess::removeFramework(
 {
   CHECK(initialized);
 
+  // Free up resources on agents if any.
+  foreachvalue (Slave& slave, slaves) {
+    slave.increaseAvailable(
+        frameworkId,
+        slave.getOfferedOrAllocated().get(frameworkId).getOrElse(Resources()));
+  }
+
+  // Update tracking in the role tree and sorters.
+
   Framework& framework = *CHECK_NOTNONE(getFramework(frameworkId));
 
   foreach (const string& role, framework.roles) {
@@ -926,11 +935,13 @@ void HierarchicalAllocatorProcess::removeSlave(
 {
   const Slave& slave = *CHECK_NOTNONE(getSlave(slaveId));
 
-  // TODO(bmahler): Per MESOS-621, this should remove the allocations
-  // that any frameworks have on this slave. Otherwise the caller may
-  // "leak" allocated resources accidentally if they forget to recover
-  // all the resources. Fixing this would require more information
-  // than what we currently track in the allocator.
+  // Untrack resources in roleTree and sorter.
+  foreachpair (
+      const FrameworkID& frameworkId,
+      const Resources& resources,
+      slave.getOfferedOrAllocated()) {
+    untrackAllocatedResources(slaveId, frameworkId, resources);
+  }
 
   roleSorter->remove(slaveId, slave.getTotal());
 
@@ -1459,6 +1470,22 @@ void HierarchicalAllocatorProcess::recoverResources(
     return;
   }
 
+  Option<Framework*> framework = getFramework(frameworkId);
+  Option<Slave*> slave = getSlave(slaveId);
+
+  // No work to do if either the framework or the agent no longer exists.
+  //
+  // The framework may not exist if we dispatched Master::offer before we
+  // received MesosAllocatorProcess::removeFramework or
+  // MesosAllocatorProcess::deactivateFramework, in which case we will
+  // have already recovered all of its resources.
+  //
+  // The agent may not exist if we dispatched Master::offer before we
+  // received `removeSlave`.
+  if (framework.isNone() || slave.isNone()) {
+    return;
+  }
+
   // For now, we require that resources are recovered within a single
   // allocation role (since filtering in the same manner across roles
   // seems undesirable).
@@ -1472,41 +1499,26 @@ void HierarchicalAllocatorProcess::recoverResources(
 
   string role = allocations.begin()->first;
 
-  // Updated resources allocated to framework (if framework still
-  // exists, which it might not in the event that we dispatched
-  // Master::offer before we received
-  // MesosAllocatorProcess::removeFramework or
-  // MesosAllocatorProcess::deactivateFramework, in which case we will
-  // have already recovered all of its resources).
-  Option<Framework*> framework = getFramework(frameworkId);
+  // Update resources on the agent.
-  if (framework.isSome()) {
-    Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
+  CHECK((*slave)->getTotalOfferedOrAllocated().contains(resources))
+    << "agent " << slaveId << " resources "
+    << (*slave)->getTotalOfferedOrAllocated() << " do not contain "
+    << resources;
 
-    if (frameworkSorter->contains(frameworkId.value())) {
-      untrackAllocatedResources(slaveId, frameworkId, resources);
-    }
-  }
+  (*slave)->increaseAvailable(frameworkId, resources);
 
-  // Update resources allocated on slave (if slave still exists,
-  // which it might not in the event that we dispatched Master::offer
-  // before we received Allocator::removeSlave).
-  Option<Slave*> slave = getSlave(slaveId);
+  VLOG(1) << "Recovered " << resources << " (total: " << (*slave)->getTotal()
+          << ", offered or allocated: "
+          << (*slave)->getTotalOfferedOrAllocated() << ")"
+          << " on agent " << slaveId << " from framework " << frameworkId;
 
-  if (slave.isSome()) {
-    CHECK((*slave)->getTotalOfferedOrAllocated().contains(resources))
-      << "agent " << slaveId << " resources "
-      << (*slave)->getTotalOfferedOrAllocated() << " do not contain "
-      << resources;
+  // Update role tree and sorter.
 
-    (*slave)->increaseAvailable(frameworkId, resources);
+  Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
 
-    VLOG(1) << "Recovered " << resources
-            << " (total: " << (*slave)->getTotal()
-            << ", offered or allocated: "
-            << (*slave)->getTotalOfferedOrAllocated() << ")"
-            << " on agent " << slaveId
-            << " from framework " << frameworkId;
+  if (frameworkSorter->contains(frameworkId.value())) {
+    untrackAllocatedResources(slaveId, frameworkId, resources);
   }
 
   // No need to install the filter if 'filters' is none.
@@ -1514,10 +1526,7 @@ void HierarchicalAllocatorProcess::recoverResources(
     return;
   }
 
-  // No need to install the filter if slave/framework does not exist.
-  if (framework.isNone() || slave.isNone()) {
-    return;
-  }
+  // Update filters.
 
   // Create a refused resources filter.
   Try<Duration> timeout = Duration::create(Filters().refuse_seconds());
diff --git a/src/master/allocator/mesos/hierarchical.hpp b/src/master/allocator/mesos/hierarchical.hpp
index 36aa2fc..d42124f 100644
--- a/src/master/allocator/mesos/hierarchical.hpp
+++ b/src/master/allocator/mesos/hierarchical.hpp
@@ -321,8 +321,12 @@ public:
       const FrameworkID& frameworkId,
       const Resources& offeredOrAllocated_)
   {
     // Increasing available is to subtract offered or allocated.
+    if (offeredOrAllocated_.empty()) {
+      return;
+    }
     Resources& resources = offeredOrAllocated.at(frameworkId);
+    CHECK_CONTAINS(resources, offeredOrAllocated_);
     resources -= offeredOrAllocated_;
     if (resources.empty()) {
       offeredOrAllocated.erase(frameworkId);
@@ -336,6 +340,10 @@ public:
   void decreaseAvailable(
       const FrameworkID& frameworkId,
       const Resources& offeredOrAllocated_)
   {
+    if (offeredOrAllocated_.empty()) {
+      return;
+    }
+
     // Decreasing available is to add offered or allocated.
     offeredOrAllocated[frameworkId] += offeredOrAllocated_;
diff --git a/src/master/master.cpp b/src/master/master.cpp
index e5bc493..65994aa 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -11640,12 +11640,6 @@ void Master::_removeSlave(
 
   // We want to remove the slave first, to avoid the allocator
   // re-allocating the recovered resources.
-  //
-  // NOTE: Removing the slave is not sufficient for recovering the
-  // resources in the allocator, because the "Sorters" are updated
-  // only within recoverResources() (see MESOS-621). The calls to
-  // recoverResources() below are therefore required, even though
-  // the slave is already removed.
   allocator->removeSlave(slave->id);
 
   // Transition the tasks to lost and remove them.
@@ -11774,12 +11768,6 @@
 {
   // We want to remove the slave first, to avoid the allocator
   // re-allocating the recovered resources.
-  //
-  // NOTE: Removing the slave is not sufficient for recovering the
-  // resources in the allocator, because the "Sorters" are updated
-  // only within recoverResources() (see MESOS-621). The calls to
-  // recoverResources() below are therefore required, even though
-  // the slave is already removed.
   allocator->removeSlave(slave->id);
 
   // Transition tasks to TASK_UNREACHABLE/TASK_GONE_BY_OPERATOR/TASK_LOST
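
For readers tracing the allocator change above: recovery bookkeeping now funnels through the agent's per-framework offeredOrAllocated map, where decreaseAvailable() records an offer or allocation and increaseAvailable() recovers it. The empty-resources early return added to both helpers is what makes the new removeFramework() loop safe: it calls increaseAvailable() for every agent, including agents on which the framework has nothing tracked, and without the guard the offeredOrAllocated.at(frameworkId) lookup would be reached for a missing entry. Below is a minimal standalone sketch of that pattern (illustrative only, not Mesos code), using std::string in place of FrameworkID, a plain CPU count in place of mesos::Resources, and assert() in place of CHECK_CONTAINS:

#include <cassert>
#include <iostream>
#include <map>
#include <string>

// Stand-ins for the Mesos types (illustrative only).
using FrameworkID = std::string;
using Cpus = double;

class SlaveState
{
public:
  // Decreasing available == tracking a new offer or allocation.
  void decreaseAvailable(const FrameworkID& frameworkId, Cpus cpus)
  {
    // Mirrors the patch's early return: empty updates are no-ops.
    if (cpus == 0.0) {
      return;
    }

    offeredOrAllocated[frameworkId] += cpus;
  }

  // Increasing available == recovering an offer or allocation.
  void increaseAvailable(const FrameworkID& frameworkId, Cpus cpus)
  {
    // Without this guard, recovering "nothing" from a framework that has
    // no entry would reach the at() below and throw; the guard is what
    // lets removeFramework() loop over every agent unconditionally.
    if (cpus == 0.0) {
      return;
    }

    Cpus& tracked = offeredOrAllocated.at(frameworkId);
    assert(tracked >= cpus); // Analogue of CHECK_CONTAINS.

    tracked -= cpus;
    if (tracked == 0.0) {
      offeredOrAllocated.erase(frameworkId); // Keep the map sparse.
    }
  }

  Cpus totalOfferedOrAllocated() const
  {
    Cpus total = 0.0;
    for (const auto& entry : offeredOrAllocated) {
      total += entry.second;
    }
    return total;
  }

private:
  // Per-framework resources offered to or allocated on this agent.
  std::map<FrameworkID, Cpus> offeredOrAllocated;
};

int main()
{
  SlaveState slave;

  slave.decreaseAvailable("framework-1", 4.0); // Offer 4 CPUs.
  slave.increaseAvailable("framework-1", 4.0); // Recover all of them.
  slave.increaseAvailable("framework-1", 0.0); // Entry is gone; still a no-op.

  std::cout << slave.totalOfferedOrAllocated() << std::endl; // Prints 0.

  return 0;
}

The sketch also shows why entries are erased once they drain to empty: keeping the map sparse means "framework has no entry" and "framework has nothing offered or allocated here" stay equivalent, which is the invariant the empty-update guards rely on.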
