This is an automated email from the ASF dual-hosted git repository.

josephwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 32e4a404174e8d6b32b23a64569ae44ecd8e4351
Author: Joseph Wu <[email protected]>
AuthorDate: Tue Feb 12 14:06:31 2019 -0800

    Handle possible orphaned operations after master/agent failover.

    This is one of two possible code paths which can introduce orphaned
    operations.

    When a master failover occurs, all agents and frameworks must
    reregister with the master. Agents that reregister will report
    their operations with an UpdateSlaveMessage. Any operations without
    a known framework will be considered orphans.

    Known frameworks are discovered when the framework reregisters,
    or an agent running a task under the framework reregisters.
    The race between these reregistrations will be addressed in a
    separate commit.

    Agent failover can also introduce orphans, if the agent has a pending
    operation during failover, and then is migrated to a separate master
    before restarting. This will be handled the same way as agent
    reregistration after a master failover.

    Review: https://reviews.apache.org/r/69961
---
 src/master/master.cpp | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/master/master.cpp b/src/master/master.cpp
index 75e6db7..da970a5 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8232,6 +8232,11 @@ void Master::updateSlave(UpdateSlaveMessage&& message)
             resourceProvider.resource_version_uuid(),
             {}});
 
+        // NOTE: We must add the resource provider's resources to the total
+        // before adding any operations, because orphan operations will
+        // subsequently subtract from this total.
+        slave->totalResources += resourceProvider.total_resources();
+
         hashmap<FrameworkID, Resources> usedByOperations;
 
         foreach (
@@ -8248,8 +8253,8 @@ void Master::updateSlave(UpdateSlaveMessage&& message)
           if (!protobuf::isTerminalState(operation.latest_status().state()) &&
               operation.has_framework_id()) {
             // If we do not yet know the `FrameworkInfo` of the framework the
-            // operation originated from, we cannot properly track the operation
-            // at this point.
+            // operation originated from, the operation is an orphan, and
+            // will not be accounted for by the allocator.
             //
             // TODO(bbannier): Consider introducing ways of making sure an agent
             // always knows the `FrameworkInfo` of operations triggered on its
@@ -8257,12 +8262,6 @@ void Master::updateSlave(UpdateSlaveMessage&& message)
             // operations like is already done for `RunTaskMessage`, see
             // MESOS-8582.
             if (framework == nullptr) {
-              LOG(WARNING)
-                << "Cannot properly account for operation " << operation.uuid()
-                << " learnt in reconciliation of agent " << slaveId
-                << " since framework " << operation.framework_id()
-                << " is unknown; this can lead to assertion failures after the"
-                   " operation terminates, see MESOS-8536";
               continue;
             }
 
@@ -8278,8 +8277,6 @@ void Master::updateSlave(UpdateSlaveMessage&& message)
             }
           }
 
-        slave->totalResources += resourceProvider.total_resources();
-
         allocator->addResourceProvider(
             slaveId, resourceProvider.total_resources(), usedByOperations);
       } else {
@@ -11513,6 +11510,22 @@ void Master::addOperation(
 
   if (framework != nullptr) {
     framework->addOperation(operation);
+  } else {
+    // When the framework is not known by the master, this means either:
+    //   * The framework has been completed.
+    //   * The framework has no known tasks and has yet to reregister.
+    // The master cannot always differentiate these cases, because completed
+    // frameworks are only kept in memory, in a circular buffer.
+    //
+    // TODO(josephw): Once MESOS-8582 is resolved, operations may include
+    // enough information to add a framework entry, which means only
+    // completed frameworks would result in orphans.
+    //
+    // These operations will be preemptively considered "orphans" and
+    // will be given a grace period before the master adopts them.
+    // After which, the master will acknowledge any associated operation
+    // status updates.
+    slave->markOperationAsOrphan(operation);
   }
 }
