This is an automated email from the ASF dual-hosted git repository.

josephwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit aa75504c4b7baaf2c6d4af4829f3ea98982e291e
Author: Joseph Wu <[email protected]>
AuthorDate: Tue Feb 12 11:48:44 2019 -0800

    Added the concept of "orphaned operations" to the master.
    
    An orphaned operation is a non-speculative operation whose
    originating framework is unknown.  These operations will consume
    resources until they are terminated, but will have no entry
    in the allocator because their associated framework does not exist.
    
    To account for resources used by orphaned operations, the operation's
    resources are removed from the agent's total resources upon being
    orphaned.
    
    This commit handles one of the two possible code paths which can
    introduce orphaned operations.
    
    Review: https://reviews.apache.org/r/69960
---
 src/master/master.cpp | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/master/master.hpp | 21 +++++++++++++
 2 files changed, 103 insertions(+)

diff --git a/src/master/master.cpp b/src/master/master.cpp
index 015da54..75e6db7 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -10645,7 +10645,26 @@ void Master::removeFramework(Framework* framework)
     }
   }
 
+  hashset<Slave*> slavesWithOrphanOperations;
   foreachvalue (Operation* operation, utils::copy(framework->operations)) {
+    // Non-speculative operations are considered "orphaned" once the
+    // originating framework is removed. The resources used by the
+    // operation will remain allocated until a terminal operation
+    // status update is received.
+    if (!protobuf::isSpeculativeOperation(operation->info())) {
+      CHECK(operation->has_slave_id())
+        << "External resource provider is not supported yet";
+
+      Slave* slave = slaves.registered.get(operation->slave_id());
+      CHECK_NOTNULL(slave);
+
+      slave->markOperationAsOrphan(operation);
+
+      // We defer the required updates to the allocator until after the
+      // framework has been removed from the allocator.
+      slavesWithOrphanOperations.insert(slave);
+    }
+
     framework->removeOperation(operation);
   }
 
@@ -10684,10 +10703,28 @@ void Master::removeFramework(Framework* framework)
     }
   }
 
+  // Prevent any allocations from occurring between the multiple resource
+  // changes below. Removal of a framework removes allocation, while orphan
+  // operations will reduce total resources.
+  allocator->pause();
+
   // Remove the framework.
   frameworks.registered.erase(framework->id());
   allocator->removeFramework(framework->id());
 
+  // For any pending operations, we temporarily remove the operations'
+  // resources from the allocator, because these resources are technically
+  // still in use by the (now removed) framework.
+  foreach (Slave* slave, slavesWithOrphanOperations) {
+    allocator->updateSlave(slave->id, slave->info, slave->totalResources);
+
+    // NOTE: Even though we are modifying the slave's total resources, we
+    // do not need to rescind any offers because the resources removed cannot
+    // be offered between the `removeFramework()` and `updateSlave()` calls.
+  }
+
+  allocator->resume();
+
   // The framework pointer is now owned by `frameworks.completed`.
   frameworks.completed.set(framework->id(), Owned<Framework>(framework));
 
@@ -12772,6 +12809,51 @@ void Slave::removeOperation(Operation* operation)
 }
 
 
+void Slave::markOperationAsOrphan(Operation* operation)
+{
+  // Only non-speculative operations can be orphaned.
+  if (protobuf::isSpeculativeOperation(operation->info())) {
+    return;
+  }
+
+  LOG(INFO) << "Marking operation " << operation->uuid()
+            << (operation->info().has_id()
+                ? " (ID: " + operation->info().id().value() + ")"
+                : "")
+            << (operation->has_slave_id()
+                ? " (Agent: " + operation->slave_id().value() + ")"
+                : "")
+            << (operation->has_framework_id()
+                ? " (Framework: " + operation->framework_id().value() + ")"
+                : "")
+            << " in state " << operation->latest_status().state()
+            << " as an orphan";
+
+  orphanedOperations.insert(operation->uuid());
+
+  // Only non-terminal orphans require additional resource math.
+  if (protobuf::isTerminalState(operation->latest_status().state())) {
+    return;
+  }
+
+  // Orphaned operations have no framework, and hence cannot be accounted
+  // for within the allocator. Instead, the operation's resources are removed
+  // from the agent's total resources until the operation terminates.
+  recoverResources(operation);
+
+  Try<Resources> consumed = protobuf::getConsumedResources(operation->info());
+  CHECK_SOME(consumed);
+
+  Resources consumedUnallocated = consumed.get();
+  consumedUnallocated.unallocate();
+
+  CHECK(totalResources.contains(consumedUnallocated))
+    << "Unknown resources from orphan operation: " << consumedUnallocated;
+
+  totalResources -= consumedUnallocated;
+}
+
+
 Operation* Slave::getOperation(const UUID& uuid) const
 {
   if (operations.contains(uuid)) {
diff --git a/src/master/master.hpp b/src/master/master.hpp
index ccd117f..aceab34 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -153,6 +153,14 @@ Slave(Master* const _master,
 
   void removeOperation(Operation* operation);
 
+  // Marks a non-speculative operation as an orphan when the originating
+  // framework is torn down by the master, or when an agent reregisters
+  // with operations from unknown frameworks. If the operation is
+  // non-terminal, this has the side effect of modifying the agent's
+  // total resources, and should therefore be followed by
+  // `allocator->updateSlave()`.
+  void markOperationAsOrphan(Operation* operation);
+
   Operation* getOperation(const UUID& uuid) const;
 
   void addOffer(Offer* offer);
@@ -248,6 +256,19 @@ Slave(Master* const _master,
   // unacknowledged status updates on this agent.
   hashmap<UUID, Operation*> operations;
 
+  // Operations whose originating framework is unknown.
+  // These operations could be pending, or terminal with unacknowledged
+  // status updates.
+  //
+  // This set can be populated whenever a framework is torn down in the
+  // lifetime of the master, or when an agent reregisters with an operation.
+  //
+  // If the originating framework is completed, the master will
+  // acknowledge any status updates instead of the framework.
+  // If an orphan does not belong to a completed framework, the master
+  // will only acknowledge status updates after a fixed delay.
+  hashset<UUID> orphanedOperations;
+
   // Active offers on this slave.
   hashset<Offer*> offers;
 

Reply via email to