This is an automated email from the ASF dual-hosted git repository.

josephwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 32e4a404174e8d6b32b23a64569ae44ecd8e4351
Author: Joseph Wu <[email protected]>
AuthorDate: Tue Feb 12 14:06:31 2019 -0800

    Handle possible orphaned operations after master/agent failover.
    
    This is one of two possible code paths which can introduce orphaned
    operations.
    
    When a master failover occurs, all agents and frameworks must
    reregister with the master.  Agents that reregister will report their
    operations with an UpdateSlaveMessage.  Any operations without a
    known framework will be considered orphans.  Known frameworks are
    discovered when the framework reregisters, or when an agent running a
    task under the framework reregisters.  The race between these
    reregistrations will be addressed in a separate commit.
    
    Agent failover can also introduce orphans, if the agent has a pending
    operation during failover, and then is migrated to a separate master
    before restarting.  This will be handled the same way as agent
    reregistration after a master failover.
    
    Review: https://reviews.apache.org/r/69961
---
 src/master/master.cpp | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/master/master.cpp b/src/master/master.cpp
index 75e6db7..da970a5 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8232,6 +8232,11 @@ void Master::updateSlave(UpdateSlaveMessage&& message)
              resourceProvider.resource_version_uuid(),
              {}});
 
+        // NOTE: We must add the resource provider's resources to the total
+        // before adding any operations, because orphan operations will
+        // subsequently subtract from this total.
+        slave->totalResources += resourceProvider.total_resources();
+
         hashmap<FrameworkID, Resources> usedByOperations;
 
         foreach (
@@ -8248,8 +8253,8 @@ void Master::updateSlave(UpdateSlaveMessage&& message)
           if (!protobuf::isTerminalState(operation.latest_status().state()) &&
               operation.has_framework_id()) {
             // If we do not yet know the `FrameworkInfo` of the framework the
-            // operation originated from, we cannot properly track the operation
-            // at this point.
+            // operation originated from, the operation is an orphan, and
+            // will not be accounted for by the allocator.
             //
             // TODO(bbannier): Consider introducing ways of making sure an agent
             // always knows the `FrameworkInfo` of operations triggered on its
@@ -8257,12 +8262,6 @@ void Master::updateSlave(UpdateSlaveMessage&& message)
             // operations like is already done for `RunTaskMessage`, see
             // MESOS-8582.
             if (framework == nullptr) {
-              LOG(WARNING)
-                << "Cannot properly account for operation " << operation.uuid()
-                << " learnt in reconciliation of agent " << slaveId
-                << " since framework " << operation.framework_id()
-                << " is unknown; this can lead to assertion failures after the"
-                   " operation terminates, see MESOS-8536";
               continue;
             }
 
@@ -8278,8 +8277,6 @@ void Master::updateSlave(UpdateSlaveMessage&& message)
           }
         }
 
-        slave->totalResources += resourceProvider.total_resources();
-
         allocator->addResourceProvider(
             slaveId, resourceProvider.total_resources(), usedByOperations);
       } else {
@@ -11513,6 +11510,22 @@ void Master::addOperation(
 
   if (framework != nullptr) {
     framework->addOperation(operation);
+  } else {
+    // When the framework is not known by the master, this means either:
+    //   * The framework has been completed.
+    //   * The framework has no known tasks and has yet to reregister.
+    // The master cannot always differentiate these cases, because completed
+    // frameworks are only kept in memory, in a circular buffer.
+    //
+    // TODO(josephw): Once MESOS-8582 is resolved, operations may include
+    // enough information to add a framework entry, which means only
+    // completed frameworks would result in orphans.
+    //
+    // These operations will be preemptively considered "orphans" and
+    // will be given a grace period before the master adopts them.
+    // After which, the master will acknowledge any associated operation
+    // status updates.
+    slave->markOperationAsOrphan(operation);
   }
 }
 

Reply via email to