Added a master flag to disallow agents without domain. Added a new `--require_agent_domain` flag and implementation. When set to true, it will cause the master to refuse (re-)registration attempts for agents with no configured domain.
This is intended as a safety net for operators, who could forget to configure the fault domain of a remote agent and let it join the cluster. If this happens, an agent in a remote region will be considered a local agent by the master and frameworks (because agent's fault domain is not configured), causing tasks to potentially land in a remote agent which is undesirable. Review: https://reviews.apache.org/r/64507/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/79d18926 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/79d18926 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/79d18926 Branch: refs/heads/1.5.x Commit: 79d189260eb705c73fd7fa356cef6e28351cb490 Parents: 4bc5f03 Author: Benno Evers <bev...@mesosphere.com> Authored: Tue Jan 2 10:58:35 2018 -0800 Committer: Vinod Kone <vinodk...@gmail.com> Committed: Tue Jan 2 11:45:33 2018 -0800 ---------------------------------------------------------------------- docs/configuration/master.md | 8 ++++ src/master/flags.cpp | 5 +++ src/master/flags.hpp | 1 + src/master/master.cpp | 24 ++++++++++ src/tests/master_tests.cpp | 93 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 131 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/79d18926/docs/configuration/master.md ---------------------------------------------------------------------- diff --git a/docs/configuration/master.md b/docs/configuration/master.md index 6af16a0..3ff97ea 100644 --- a/docs/configuration/master.md +++ b/docs/configuration/master.md @@ -487,6 +487,14 @@ after which the operation is considered a failure. (default: 20secs) </tr> <tr> <td> + --[no-]require_agent_domain + </td> + <td> +If true, only agents with a configured domain can register. (default: false) + </td> +</tr> +<tr> + <td> --roles=VALUE </td> <td> http://git-wip-us.apache.org/repos/asf/mesos/blob/79d18926/src/master/flags.cpp ---------------------------------------------------------------------- diff --git a/src/master/flags.cpp b/src/master/flags.cpp index 18f405b..b49cb63 100644 --- a/src/master/flags.cpp +++ b/src/master/flags.cpp @@ -642,6 +642,11 @@ mesos::internal::master::Flags::Flags() "the IP address which the master will try to bind to.\n" "Cannot be used in conjunction with `--ip`."); + add(&Flags::require_agent_domain, + "require_agent_domain", + "If true, only agents with a configured domain can register.\n", + false); + add(&Flags::domain, "domain", "Domain that the master belongs to. Mesos currently only supports\n" http://git-wip-us.apache.org/repos/asf/mesos/blob/79d18926/src/master/flags.hpp ---------------------------------------------------------------------- diff --git a/src/master/flags.hpp b/src/master/flags.hpp index edda71a..dabb414 100644 --- a/src/master/flags.hpp +++ b/src/master/flags.hpp @@ -96,6 +96,7 @@ public: Duration registry_gc_interval; Duration registry_max_agent_age; size_t registry_max_agent_count; + bool require_agent_domain; Option<DomainInfo> domain; // The following flags are executable specific (e.g., since we only http://git-wip-us.apache.org/repos/asf/mesos/blob/79d18926/src/master/master.cpp ---------------------------------------------------------------------- diff --git a/src/master/master.cpp b/src/master/master.cpp index 4528687..03eb178 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -6207,6 +6207,18 @@ void Master::_registerSlave( return; } + // Don't allow agents without domain if domains are required. + // We don't shutdown the agent to allow it to restart itself with + // the correct domain and without losing tasks. + if (flags.require_agent_domain && !slaveInfo.has_domain()) { + LOG(WARNING) << "Agent at " << pid << " attempted to register without " + << "a domain, but this master is configured to require agent " + << "domains. Ignoring agent registration attempt"; + + slaves.registering.erase(pid); + return; + } + // Check if this slave is already registered (because it retries). if (Slave* slave = slaves.registered.get(pid)) { if (!slave->connected) { @@ -6578,6 +6590,18 @@ void Master::_reregisterSlave( return; } + // Don't allow agents without domain if domains are required. + // We don't shutdown the agent to allow it to restart itself with + // the correct domain and without losing tasks. + if (flags.require_agent_domain && !slaveInfo.has_domain()) { + LOG(WARNING) << "Agent at " << pid << " attempted to register without " + << "a domain, but this master is configured to require agent " + << "domains. Ignoring agent re-registration attempt"; + + slaves.reregistering.erase(slaveInfo.id()); + return; + } + if (Slave* slave = slaves.registered.get(slaveInfo.id())) { CHECK(!slaves.recovered.contains(slaveInfo.id())); http://git-wip-us.apache.org/repos/asf/mesos/blob/79d18926/src/tests/master_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp index 3f9e5f4..5546fd9 100644 --- a/src/tests/master_tests.cpp +++ b/src/tests/master_tests.cpp @@ -8222,6 +8222,99 @@ TEST_F(MasterTest, AgentDomainMismatch) } +// This test checks that if the `--require_agent_domain` flag is set and +// the agent does not have a domain configured, the registration attempt +// will fail. +TEST_F(MasterTest, RegistrationWithoutRequiredAgentDomain) +{ + Clock::pause(); + + master::Flags masterFlags = CreateMasterFlags(); + masterFlags.domain = createDomainInfo("region-abc", "zone-456"); + masterFlags.require_agent_domain = true; + + Try<Owned<cluster::Master>> master = StartMaster(masterFlags); + ASSERT_SOME(master); + + // Agent should attempt to register. + Future<RegisterSlaveMessage> registerSlaveMessage = + FUTURE_PROTOBUF(RegisterSlaveMessage(), _, _); + + // If the agent is allowed to register, the master will update the + // registry. The agent should not be allowed to register, so we + // expect that no registrar operations will be observed. + EXPECT_CALL(*master.get()->registrar, apply(_)) + .Times(0); + + Owned<MasterDetector> detector = master.get()->createDetector(); + slave::Flags slaveFlags = CreateSlaveFlags(); + Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags); + ASSERT_SOME(slave); + + Clock::advance(slaveFlags.registration_backoff_factor); + Clock::settle(); + + AWAIT_READY(registerSlaveMessage); +} + + +// This test checks that if the `--require_agent_domain` flag is set and +// the agent does not have a domain configured when trying to re-register, +// the re-registration attempt will fail. +TEST_F(MasterTest, ReregistrationWithoutRequiredAgentDomain) +{ + Clock::pause(); + + master::Flags masterFlags = CreateMasterFlags(); + masterFlags.domain = createDomainInfo("region-abc", "zone-456"); + masterFlags.require_agent_domain = true; + + Try<Owned<cluster::Master>> master = StartMaster(masterFlags); + ASSERT_SOME(master); + + slave::Flags slaveFlags = CreateSlaveFlags(); + slaveFlags.domain = createDomainInfo("region-abc", "zone-456"); + + // Agent should successfully register. + Future<SlaveRegisteredMessage> slaveRegisteredMessage = + FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _); + + Owned<MasterDetector> detector = master.get()->createDetector(); + Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags); + + Clock::advance(slaveFlags.registration_backoff_factor); + + AWAIT_READY(slaveRegisteredMessage); + + // Shut down slave and re-register without domain. We do this by + // intercepting the `ReregisterSlaveMessage` and erasing the domain + // before passing it on to the master. + Future<ReregisterSlaveMessage> reregisterSlaveMessage = + DROP_PROTOBUF(ReregisterSlaveMessage(), _, _); + + slave.get()->shutdown(); + slave = StartSlave(detector.get(), slaveFlags); + ASSERT_SOME(slave); + + Clock::advance(slaveFlags.authentication_backoff_factor); + Clock::advance(slaveFlags.registration_backoff_factor); + + AWAIT_READY(reregisterSlaveMessage); + + ReregisterSlaveMessage message = reregisterSlaveMessage.get(); + message.mutable_slave()->clear_domain(); + + // If the agent is allowed to register, the master will update the + // registry. The agent should not be allowed to register, so we + // expect that no registrar operations will be observed. + EXPECT_CALL(*master.get()->registrar, apply(_)) + .Times(0); + + process::post(slave.get()->pid, master.get()->pid, message); + Clock::settle(); +} + + // This test checks that if the agent is configured with a domain but // the master is not, the agent is not allowed to re-register. This // might happen if the leading master is configured with a domain but