This is an automated email from the ASF dual-hosted git repository. bmahler pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit d821f758e60a5ede3b1146818767f9d1c8c7cb0a Author: Benjamin Mahler <[email protected]> AuthorDate: Mon Apr 15 18:01:16 2024 -0400 Add a regression test for the mitigation of MESOS-7187. --- src/tests/master_slave_reconciliation_tests.cpp | 87 +++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/src/tests/master_slave_reconciliation_tests.cpp b/src/tests/master_slave_reconciliation_tests.cpp index 9881b9c08..a14846975 100644 --- a/src/tests/master_slave_reconciliation_tests.cpp +++ b/src/tests/master_slave_reconciliation_tests.cpp @@ -32,6 +32,8 @@ #include <process/process.hpp> #include <process/protobuf.hpp> +#include <stout/uuid.hpp> + #include "common/protobuf_utils.hpp" #include "master/master.hpp" @@ -1161,6 +1163,91 @@ TEST_F(MasterSlaveReconciliationTest, SlaveReregisterTaskExecutorIds) driver.join(); } + +// This tests one particular issue that's caused by MESOS-7187 whereby the +// following occurred: +// +// 1. ZK session expired +// 2. Master failover +// 3. Agent run 1 sends re-registration message to new master with +// resource UUID 1. +// 4. Agent fails over (for upgrade) +// 5. Agent run 2 sends re-registration message to new master with +// resource UUID 2. +// 6. Master receives run 1 re-registration message. +// 7. Master ignores run 2 re-registration message +// (as agent is already re-registering). +// 8. Master completes re-registration and stores resource UUID 1 and notifies agent. +// 9. Agent receives re-registration completion, sends resource update with UUID 2. +// 10. Master does not update the agent's resource UUID +// (not because it ignores the update message, but because the logic simply +// wasn't making any update to it), so it remains UUID 1. +// +// Now with the fix we expect the master to have updated the UUID, although +// note that the fix isn't a complete fix for MESOS-7187, rather just fixes +// the resource UUID mismatch. +TEST_F(MasterSlaveReconciliationTest, SlaveReregistrationRace_MESOS_7187) +{ + Try<Owned<cluster::Master>> master = StartMaster(); + ASSERT_SOME(master); + + // Start an agent, but intercept the resource update message so that + // we can inject a new resource UUID and ensure the master stores it. + Future<UpdateSlaveMessage> updateSlaveMessage = + DROP_PROTOBUF(UpdateSlaveMessage(), _, _); + + slave::Flags flags = CreateSlaveFlags(); + StandaloneMasterDetector detector(master.get()->pid); + Try<Owned<cluster::Slave>> slave = StartSlave(&detector, flags); + ASSERT_SOME(slave); + + AWAIT_READY(updateSlaveMessage); + + // Now send the update with a tweaked resource UUID. + id::UUID uuid = id::UUID::random(); + UpdateSlaveMessage updateCopy = updateSlaveMessage.get(); + updateCopy.mutable_resource_version_uuid()->set_value(uuid.toBytes()); + process::post(slave.get()->pid, master.get()->pid, updateCopy); + + // Now launch a task and check that the resource version is our + // mutated version. + MockScheduler sched; + MesosSchedulerDriver driver( + &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL); + + Future<FrameworkID> frameworkId; + EXPECT_CALL(sched, registered(&driver, _, _)) + .WillOnce(FutureArg<1>(&frameworkId)); + + Future<vector<Offer>> offers; + EXPECT_CALL(sched, resourceOffers(&driver, _)) + .WillOnce(FutureArg<1>(&offers)) + .WillRepeatedly(Return()); // Ignore subsequent offers. + + driver.start(); + + AWAIT_READY(frameworkId); + + AWAIT_READY(offers); + EXPECT_NE(0u, offers->size()); + + Future<RunTaskMessage> runTaskMessage = + DROP_PROTOBUF(RunTaskMessage(), _, _); + + const Offer& offer = offers->front(); + TaskInfo task = createTask(offer, "echo hi"); + driver.launchTasks(offer.id(), {task}); + + AWAIT_READY(runTaskMessage); + + ASSERT_EQ(1, runTaskMessage->resource_version_uuids_size()); + EXPECT_FALSE( + runTaskMessage->resource_version_uuids(0).has_resource_provider_id()); + EXPECT_EQ( + uuid.toBytes(), runTaskMessage->resource_version_uuids(0).uuid().value()); +} + + } // namespace tests { } // namespace internal { } // namespace mesos {
