Repository: mesos
Updated Branches:
  refs/heads/master 3f6d2ac4a -> 51a3bd95b


Removed the inaccurate `slaves.transitioning` function in the master.

This function used to return whether the agent was transitioning
between states, however it was updated in e510813f93e253480005ce
to return only when the agent is recovered.

This patch removes the now confusing function in favor of just
directly checking the state we care about in the reconciliation
logic. Since there were no other usages, the function is removed.

Review: https://reviews.apache.org/r/64968


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/51a3bd95
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/51a3bd95
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/51a3bd95

Branch: refs/heads/master
Commit: 51a3bd95bd2d740a39b55634251abeadb561e5c8
Parents: 3f6d2ac
Author: Benjamin Mahler <[email protected]>
Authored: Thu Jan 4 17:29:58 2018 -0800
Committer: Benjamin Mahler <[email protected]>
Committed: Tue Jan 9 18:47:49 2018 -0800

----------------------------------------------------------------------
 src/master/master.cpp | 30 +++++++++++++++++++++---------
 src/master/master.hpp |  9 ---------
 2 files changed, 21 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/51a3bd95/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 1610f43..6fc5de8 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8449,13 +8449,18 @@ void Master::_reconcileTasks(
   // Explicit reconciliation occurs for the following cases:
   //   (1) Task is known, but pending: TASK_STAGING.
   //   (2) Task is known: send the latest state.
-  //   (3) Task is unknown, slave is registered: TASK_GONE.
-  //   (4) Task is unknown, slave is transitioning: no-op.
+  //   (3) Task is unknown, slave is recovered: no-op.
+  //   (4) Task is unknown, slave is registered: TASK_GONE.
   //   (5) Task is unknown, slave is unreachable: TASK_UNREACHABLE.
   //   (6) Task is unknown, slave is gone: TASK_GONE_BY_OPERATOR.
   //   (7) Task is unknown, slave is unknown: TASK_UNKNOWN.
   //
-  // For cases (3), (5), (6) and (7) TASK_LOST is sent instead if the
+  // For case (3), if the slave ID is not provided, we err on the
+  // side of caution and do not reply if there are *any* recovered
+  // slaves that haven't re-registered, since the task could reside
+  // on one of these slaves.
+  //
+  // For cases (4), (5), (6) and (7) TASK_LOST is sent instead if the
   // framework has not opted-in to the PARTITION_AWARE capability.
   foreach (const TaskStatus& status, statuses) {
     Option<SlaveID> slaveId = None();
@@ -8502,8 +8507,20 @@ void Master::_reconcileTasks(
           protobuf::getTaskCheckStatus(*task),
           None(),
           protobuf::getTaskContainerStatus(*task));
+    } else if ((slaveId.isSome() && slaves.recovered.contains(slaveId.get())) 
||
+               (slaveId.isNone() && !slaves.recovered.empty())) {
+      // (3) Task is unknown, slave is recovered: no-op. The framework
+      // will have to retry this and will not receive a response until
+      // the agent either registers, or is marked unreachable after the
+      // timeout.
+      LOG(INFO) << "Dropping reconciliation of task " << status.task_id()
+                << " for framework " << *framework << " because "
+                << (slaveId.isSome() ?
+                      "agent " + stringify(slaveId.get()) + " has" :
+                      "some agents have")
+                << " not yet re-registered with the master";
     } else if (slaveId.isSome() && slaves.registered.contains(slaveId.get())) {
-      // (3) Task is unknown, slave is registered: TASK_GONE. If the
+      // (4) Task is unknown, slave is registered: TASK_GONE. If the
       // framework does not have the PARTITION_AWARE capability, send
       // TASK_LOST for backward compatibility.
       TaskState taskState = TASK_GONE;
@@ -8520,11 +8537,6 @@ void Master::_reconcileTasks(
           None(),
           "Reconciliation: Task is unknown to the agent",
           TaskStatus::REASON_RECONCILIATION);
-    } else if (slaves.transitioning(slaveId)) {
-      // (4) Task is unknown, slave is transitionary: no-op.
-      LOG(INFO) << "Dropping reconciliation of task " << status.task_id()
-                << " for framework " << *framework
-                << " because there are transitional agents";
     } else if (slaveId.isSome() && slaves.unreachable.contains(slaveId.get())) 
{
       // (5) Slave is unreachable: TASK_UNREACHABLE. If the framework
       // does not have the PARTITION_AWARE capability, send TASK_LOST

http://git-wip-us.apache.org/repos/asf/mesos/blob/51a3bd95/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 130f6e2..5e6ba53 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -1910,15 +1910,6 @@ private:
     // NOTE: Using a 'shared_ptr' here is OK because 'RateLimiter' is
     // a wrapper around libprocess process which is thread safe.
     Option<std::shared_ptr<process::RateLimiter>> limiter;
-
-    bool transitioning(const Option<SlaveID>& slaveId)
-    {
-      if (slaveId.isSome()) {
-        return recovered.contains(slaveId.get());
-      } else {
-        return !recovered.empty();
-      }
-    }
   } slaves;
 
   struct Frameworks

Reply via email to