----------------------------------------------------------- This is an automatically generated e-mail. To reply, visit: https://reviews.apache.org/r/55571/#review161827 -----------------------------------------------------------
src/master/master.cpp (lines 7201 - 7203) <https://reviews.apache.org/r/55571/#comment233128> There may be some corner cases that causes this `CHECK` to coredump. For example: 1. Start a Mesos cluster with master, agent and a scheduler with single role `role1`. 2. Master failover 3. Agent re-register before scheduler does 4. Scheduler re-subscribe with multi-role `{role2}` In this case, master re-construct FrameworkInfo from re-registered agent, which contains old role. However, in current implementation, `Error` of `updateFrameworkInfo` is not caught for recovered frameworks. In theory, this test case shouldn't cause master coredump: ```cpp master::Flags masterFlags = CreateMasterFlags(); Try<Owned<cluster::Master>> master = StartMaster(masterFlags); ASSERT_SOME(master); MockExecutor exec(DEFAULT_EXECUTOR_ID); TestContainerizer containerizer(&exec); StandaloneMasterDetector slaveDetector(master.get()->pid); Try<Owned<cluster::Slave>> slave = StartSlave(&slaveDetector, &containerizer); ASSERT_SOME(slave); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_failover_timeout(1024); frameworkInfo.set_role("role1"); Future<FrameworkID> frameworkId; { MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(&driver, _, _)) .WillOnce(FutureArg<1>(&frameworkId)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(&driver, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); Clock::pause(); Clock::settle(); Clock::resume(); AWAIT_READY(frameworkId); AWAIT_READY(offers); EXPECT_FALSE(offers->empty()); TaskInfo task; task.set_name(""); task.mutable_task_id()->set_value("1"); task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id()); task.mutable_resources()->MergeFrom(offers.get()[0].resources()); task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO); EXPECT_CALL(exec, registered(_, _, _, _)) .Times(1); EXPECT_CALL(exec, launchTask(_, _)) .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING)); Future<TaskStatus> runningStatus; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&runningStatus)); driver.launchTasks(offers.get()[0].id(), {task}); AWAIT_READY(runningStatus); EXPECT_EQ(TASK_RUNNING, runningStatus.get().state()); EXPECT_EQ(task.task_id(), runningStatus.get().task_id()); } slaveDetector.appoint(None()); master->reset(); master = StartMaster(masterFlags); ASSERT_SOME(master); Future<SlaveReregisteredMessage> slaveReregisteredMessage = FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _); slaveDetector.appoint(master.get()->pid); AWAIT_READY(slaveReregisteredMessage); frameworkInfo.mutable_id()->CopyFrom(frameworkId.get()); frameworkInfo.add_roles("role2"); frameworkInfo.clear_role(); frameworkInfo.add_capabilities()->set_type( FrameworkInfo::Capability::MULTI_ROLE); MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); Future<Nothing> registered; EXPECT_CALL(sched, registered(&driver, _, _)) .WillOnce(FutureSatisfy(®istered)); driver.start(); Clock::pause(); Clock::settle(); Clock::resume(); AWAIT_READY(registered); EXPECT_CALL(exec, shutdown(_)) .Times(AtMost(1)); driver.stop(); driver.join(); ``` - Jay Guo On Jan. 16, 2017, 7:28 p.m., Benjamin Bannier wrote: > > ----------------------------------------------------------- > This is an automatically generated e-mail. To reply, visit: > https://reviews.apache.org/r/55571/ > ----------------------------------------------------------- > > (Updated Jan. 16, 2017, 7:28 p.m.) > > > Review request for mesos, Benjamin Mahler, Jay Guo, and Guangya Liu. > > > Bugs: MESOS-6631 > https://issues.apache.org/jira/browse/MESOS-6631 > > > Repository: mesos > > > Description > ------- > > This allows us to confirm that an updated FrameworkInfo is fully > compatible with an already known one. By performing this check in > updateFrameworkInfo when can be sure that both the new and old > FrameworkInfo are available (e.g., after framework reregistration, or > master or framework failover). > > > Diffs > ----- > > src/master/master.hpp 44f4fecb1fbe8bebf830990a59a5462338e6e004 > src/master/master.cpp b863ff6e93931c3d1ee056248084c7f44caf2fd9 > > Diff: https://reviews.apache.org/r/55571/diff/ > > > Testing > ------- > > Tested as part of https://reviews.apache.org/r/55271/. > > > Thanks, > > Benjamin Bannier > >
