This is an automated email from the ASF dual-hosted git repository. mzhu pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 038a4d3bfdb1b2e75dac12e2f1a611ba99b97cec Author: Meng Zhu <[email protected]> AuthorDate: Wed Jul 17 18:46:49 2019 -0700 Added offer rescind logic for guarantees enforcement. Outstanding offers need to be rescinded as needed to ensure roles' requested guarantees can be satisfied. Note, the rescind effort here is best-effort. It is complex and expensive to rescind accurately, due to (1) the cost of tracking the correct resource state (e.g. for limits, tracking of the precise amount of consumed plus offered with no reservation overlap, and similarly, for guarantees, aggregation of all roles' consumption and outstanding offers) and (2) the race between the master and the allocator. In addition, rescinding offers for quota is mostly about improving a transient state. Once a quota is set, hopefully with resource churn, the quota will eventually be enforced. Lastly, once Mesos starts to adopt an optimistic offer model (MESOS-1607), quota enforcement will happen during admission control, rendering offer rescind unnecessary. As a result, we cut some corners here to only make best effort rescinding. Specifically, for guarantee enforcement, we pessimistically assume that what seems like "available" resources in the allocator are all gone. We greedily rescind offers until rescinded plus sum of current consumed and offered exceed the total guarantees. Also since rescinded resources and consumed and offered may overlap (when rescinded contains reservations that are also part of the consumed), it is possible that we may under rescind. On the other hand, we also pessimistically assume that there are no available resources in the cluster. So chances are we are more likely to over than under rescind. Also added a test. 
Review: https://reviews.apache.org/r/71111 --- src/master/quota_handler.cpp | 42 ++++++++++++++++++++++- src/tests/master_quota_tests.cpp | 72 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/src/master/quota_handler.cpp b/src/master/quota_handler.cpp index d1be0ab..1748fb9 100644 --- a/src/master/quota_handler.cpp +++ b/src/master/quota_handler.cpp @@ -651,7 +651,47 @@ Future<http::Response> Master::QuotaHandler::_update( } } - // TODO(mzhu): Rescind offers to satisfy guarantees. + // We then rescind offers to ensure roles' guarantees + // can be satisfied. + ResourceQuantities guarantees{config.guarantees()}; + ResourceQuantities rescinded; + + // Following the best-effort approach mentioned above, we + // pessimistically assume that what seems like "available" resources + // in the allocator are all gone. We greedily rescind offers until + // rescinded plus sum of current `consumedAndOffered` exceed the + // total guarantees. + // + // Since `rescinded` and `consumedAndOffered` may overlap (when + // rescinded contains reservations that are also part of the + // consumed), it is possible that we may under rescind. + // On the other hand, we also pessimistically assume that + // there are no available resources in the cluster. So chances + // are we are more likely to over than under rescind. 
+ foreachvalue (const Slave* slave, master->slaves.registered) { + if ((rescinded + consumedAndOffered).contains(guarantees)) { + break; + } + + foreach (Offer* offer, utils::copy(slave->offers)) { + if ((rescinded + consumedAndOffered).contains(guarantees)) { + break; + } + + if (allocatedToRoleSubtree(*offer)) { + continue; + } + + rescinded += ResourceQuantities::fromResources(offer->resources()); + + master->allocator->recoverResources( + offer->framework_id(), + offer->slave_id(), + offer->resources(), + None()); + master->removeOffer(offer, true); + } + } } return OK(); diff --git a/src/tests/master_quota_tests.cpp b/src/tests/master_quota_tests.cpp index 93f989f..c0775f4 100644 --- a/src/tests/master_quota_tests.cpp +++ b/src/tests/master_quota_tests.cpp @@ -1685,6 +1685,78 @@ TEST_F(MasterQuotaTest, AvailableResourcesAfterRescinding) } +// This test ensures that outstanding offers are rescinded as needed to satisfy +// roles' quota guarantees. +TEST_F(MasterQuotaTest, RescindOffersForUpdateQuotaGuarantees) +{ + TestAllocator<> allocator; + EXPECT_CALL(allocator, initialize(_, _, _)); + + Try<Owned<cluster::Master>> master = StartMaster(&allocator); + ASSERT_SOME(master); + + // Start an agent. + slave::Flags flags1 = CreateSlaveFlags(); + flags1.resources = "cpus:1;mem:1024"; + Owned<MasterDetector> detector = master.get()->createDetector(); + + Try<Owned<cluster::Slave>> slave1 = StartSlave(detector.get(), flags1); + ASSERT_SOME(slave1); + + // Start a framework under `ROLE1`. 
+ FrameworkInfo frameworkInfo1 = DEFAULT_FRAMEWORK_INFO; + frameworkInfo1.set_roles(0, ROLE1); + + MockScheduler sched1; + MesosSchedulerDriver framework1( + &sched1, frameworkInfo1, master.get()->pid, DEFAULT_CREDENTIAL); + + Future<FrameworkID> frameworkId1; + EXPECT_CALL(sched1, registered(&framework1, _, _)) + .WillOnce(FutureArg<1>(&frameworkId1)); + + Future<vector<Offer>> offers1; + EXPECT_CALL(sched1, resourceOffers(&framework1, _)) + .WillOnce(FutureArg<1>(&offers1)) + .WillRepeatedly(Return()); // Ignore subsequent offers. + + Future<Nothing> offerRescinded1; + EXPECT_CALL(sched1, offerRescinded(&framework1, _)) + .WillOnce(FutureSatisfy(&offerRescinded1)); + + framework1.start(); + + AWAIT_READY(offers1); + ASSERT_EQ(1u, offers1->size()); + + // Cluster resources: cpus:1;mem:1024 + // Offered: `ROLE1` cpus:1;mem:1024 + + // Set `ROLE2` quota guarantees to be the cluster resources. + // Outstanding offers are rescinded. + { + process::http::Headers headers = createBasicAuthHeaders(DEFAULT_CREDENTIAL); + headers["Content-Type"] = "application/json"; + + Future<Response> response = process::http::post( + master.get()->pid, + "/api/v1", + headers, + createUpdateQuotaRequestBody( + createQuotaConfig(ROLE2, "cpus:1;mem:1024", ""))); + + AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); + + AWAIT_READY(offerRescinded1); + } + + // Tear down frameworks before agents to avoid offers being + // rescinded again. + framework1.stop(); + framework1.join(); +} + + // This tests verifies the offer rescind logic for quota limits enforcement. // If a role's quota consumption plus offered are above the requested limits, // outstanding offers of that role will be rescinded.
