Sent CheckpointResourcesMessage only when the agent re-registers with an old master (i.e., one that did not fail over). There is no need to send the checkpointed resources message to the agent if the master does not have state about the agent.
Review: https://reviews.apache.org/r/62878 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e9ac9f82 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e9ac9f82 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e9ac9f82 Branch: refs/heads/master Commit: e9ac9f8252a1aa01d72fb7b918bf723df8e7dd7b Parents: 5b83b31 Author: Jie Yu <[email protected]> Authored: Tue Oct 10 17:43:28 2017 -0700 Committer: Jie Yu <[email protected]> Committed: Sun Oct 29 15:57:28 2017 +0100 ---------------------------------------------------------------------- src/master/master.cpp | 79 ++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 41 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/e9ac9f82/src/master/master.cpp ---------------------------------------------------------------------- diff --git a/src/master/master.cpp b/src/master/master.cpp index 4b76648..c118b9d 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -6313,11 +6313,47 @@ void Master::_reregisterSlave( CHECK(slave->active) << "Unexpected connected but deactivated agent " << *slave; - // Inform the agent of the master's version of its checkpointed - // resources and the new framework pids for its tasks. + // Inform the agent of the new framework pids for its tasks. ___reregisterSlave(slave, frameworks); slaves.reregistering.erase(slaveInfo.id()); + + // Send checkpointed resources to the agent. This is important for + // the cases where the master didn't fail over. In that case, the + // master might have already applied an operation that the agent + // didn't see (e.g., due to a breaking connection). This message + // will sync the state between the master and the agent about + // checkpointed resources. 
+ CheckpointResourcesMessage message; + + message.mutable_resources()->CopyFrom(slave->checkpointedResources); + + if (!slave->capabilities.reservationRefinement) { + // If the agent is not refinement-capable, don't send it + // checkpointed resources that contain refined reservations. This + // might occur if a reservation refinement is created but never + // reaches the agent (e.g., due to network partition), and then + // the agent is downgraded before the partition heals. + // + // TODO(neilc): It would probably be better to prevent the agent + // from re-registering in this scenario. + Try<Nothing> result = downgradeResources(message.mutable_resources()); + if (result.isError()) { + LOG(WARNING) << "Not sending updated checkpointed resouces " + << slave->checkpointedResources + << " with refined reservations, since agent " << *slave + << " is not RESERVATION_REFINEMENT-capable."; + + return; + } + } + + LOG(INFO) << "Sending updated checkpointed resources " + << slave->checkpointedResources + << " to agent " << *slave; + + send(slave->pid, message); + return; } @@ -6657,45 +6693,6 @@ void Master::___reregisterSlave( recoverFramework(frameworkInfo, {}); } } - - // Send checkpointed resources to the agent. This is important for - // the cases where the master didn't fail over. In that case, the - // master might have already applied an operation that the agent - // didn't see (e.g., due to a breaking connection). This message - // will sync the state between the master and the agent about - // checkpointed resources. - // - // TODO(jieyu): This message is not necessary for the case where the - // master fails over. Consider moving this to `reconcileKnownSlave`. - CheckpointResourcesMessage message; - - message.mutable_resources()->CopyFrom(slave->checkpointedResources); - - if (!slave->capabilities.reservationRefinement) { - // If the agent is not refinement-capable, don't send it - // checkpointed resources that contain refined reservations. 
This - // might occur if a reservation refinement is created but never - // reaches the agent (e.g., due to network partition), and then - // the agent is downgraded before the partition heals. - // - // TODO(neilc): It would probably be better to prevent the agent - // from re-registering in this scenario. - Try<Nothing> result = downgradeResources(message.mutable_resources()); - if (result.isError()) { - LOG(WARNING) << "Not sending updated checkpointed resouces " - << slave->checkpointedResources - << " with refined reservations, since agent " << *slave - << " is not RESERVATION_REFINEMENT-capable."; - - return; - } - } - - LOG(INFO) << "Sending updated checkpointed resources " - << slave->checkpointedResources - << " to agent " << *slave; - - send(slave->pid, message); }
