Ignored agent registrations with duplicate agent IDs.

If an agent attempts to register and is assigned an agent ID that
already appears in the registry, the likely cause is a UUID collision:
slave IDs are prefixed with master IDs, and master IDs are randomly
generated UUIDs. UUID collisions are extremely unlikely, so in practice
this should never happen.

This commit updates the comments to clarify how unlikely this situation
is. Furthermore, when it does happen, the master now just ignores the
registration attempt, rather than trying to shutdown the agent. This is
simpler; the agent will eventually try to register again and be assigned
a new agent ID. Finally, it is unclear that shutting down the agent is
actually appropriate: the previous coding added the duplicate ID to
`slaves.removed`, which will interfere with the activity of the other
slave whose ID we collided with.

Review: https://reviews.apache.org/r/51913/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/d9d6f7cc
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/d9d6f7cc
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/d9d6f7cc

Branch: refs/heads/master
Commit: d9d6f7ccf4dbd3826a687fc1c348a8e2f952ed58
Parents: aae8196
Author: Neil Conway <neil.con...@gmail.com>
Authored: Mon Sep 19 15:49:07 2016 -0700
Committer: Vinod Kone <vinodk...@gmail.com>
Committed: Mon Sep 19 15:49:07 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp | 72 +++++++++++++++++++++++-----------------------
 src/master/master.hpp |  5 +++-
 2 files changed, 40 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/d9d6f7cc/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index e4499b5..20ccd8b 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -5079,50 +5079,50 @@ void Master::_registerSlave(
   if (admit.isFailed()) {
     LOG(FATAL) << "Failed to admit agent " << slaveInfo.id() << " at " << pid
                << " (" << slaveInfo.hostname() << "): " << admit.failure();
-  } else if (!admit.get()) {
-    // This means the slave is already present in the registrar, it's
-    // likely we generated a duplicate slave id!
+  }
+
+  if (!admit.get()) {
+    // This should only happen if there is a slaveID collision, but that
+    // is extremely unlikely in practice: slaveIDs are prefixed with the
+    // master ID, which is a randomly generated UUID. In this situation,
+    // we ignore the registration attempt. The slave will eventually try
+    // to register again and be assigned a new slave ID.
     LOG(ERROR) << "Agent " << slaveInfo.id() << " at " << pid
-               << " (" << slaveInfo.hostname() << ") was not admitted, "
-               << "asking to shut down";
-    slaves.removed.put(slaveInfo.id(), Nothing());
+               << " (" << slaveInfo.hostname() << ") was assigned"
+               << " an agent ID that already appears in the registry;"
+               << " ignoring registration attempt";
+    return;
+  }
 
-    ShutdownMessage message;
-    message.set_message(
-        "Agent attempted to register but got duplicate agent id " +
-        stringify(slaveInfo.id()));
-    send(pid, message);
-  } else {
-    MachineID machineId;
-    machineId.set_hostname(slaveInfo.hostname());
-    machineId.set_ip(stringify(pid.address.ip));
+  MachineID machineId;
+  machineId.set_hostname(slaveInfo.hostname());
+  machineId.set_ip(stringify(pid.address.ip));
 
-    Slave* slave = new Slave(
-        this,
-        slaveInfo,
-        pid,
-        machineId,
-        version,
-        Clock::now(),
-        checkpointedResources);
+  Slave* slave = new Slave(
+      this,
+      slaveInfo,
+      pid,
+      machineId,
+      version,
+      Clock::now(),
+      checkpointedResources);
 
-    ++metrics->slave_registrations;
+  ++metrics->slave_registrations;
 
-    addSlave(slave);
+  addSlave(slave);
 
-    Duration pingTimeout =
-      flags.agent_ping_timeout * flags.max_agent_ping_timeouts;
-    MasterSlaveConnection connection;
-    connection.set_total_ping_timeout_seconds(pingTimeout.secs());
+  Duration pingTimeout =
+    flags.agent_ping_timeout * flags.max_agent_ping_timeouts;
+  MasterSlaveConnection connection;
+  connection.set_total_ping_timeout_seconds(pingTimeout.secs());
 
-    SlaveRegisteredMessage message;
-    message.mutable_slave_id()->CopyFrom(slave->id);
-    message.mutable_connection()->CopyFrom(connection);
-    send(slave->pid, message);
+  SlaveRegisteredMessage message;
+  message.mutable_slave_id()->CopyFrom(slave->id);
+  message.mutable_connection()->CopyFrom(connection);
+  send(slave->pid, message);
 
-    LOG(INFO) << "Registered agent " << *slave
-              << " with " << slave->info.resources();
-  }
+  LOG(INFO) << "Registered agent " << *slave
+            << " with " << slave->info.resources();
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/d9d6f7cc/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 4ba8e53..dbcaa3d 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -1934,7 +1934,10 @@ protected:
       hashset<SlaveID>* slaveIDs,
       bool strict)
   {
-    // Check and see if this slave is currently admitted.
+    // Check if this slave is currently admitted. This should only
+    // happen if there is a slaveID collision, but that is extremely
+    // unlikely in practice: slaveIDs are prefixed with the master ID,
+    // which is a randomly generated UUID.
     if (slaveIDs->contains(info.id())) {
       if (strict) {
         return Error("Agent already admitted");

Reply via email to