[ 
https://issues.apache.org/jira/browse/MESOS-3070?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14682013#comment-14682013
 ] 

Klaus Ma commented on MESOS-3070:
---------------------------------

I'm trying to build a unit test case with our tools, but it fails for now. I have
appended the source code here; any input is welcome :).


{code}
+// This test ensures that a task with a duplicated task ID within a
+// framework is killed when the slave running the original task
+// re-registers. Refer to MESOS-3070.
+//
+// Sequence exercised:
+//   1. The framework launches task "1" on slave 1.
+//   2. Master and slave 1 are stopped; the master is restarted.
+//   3. Slave 2 is started and the framework launches a second task
+//      that reuses task ID "1" on it.
+//   4. Slave 1 is restarted; the test then expects a kill on the
+//      executor and a TASK_KILLED status update.
+TEST_F(MasterTest, KillDuplicatedTaskWhenSlaveReregister)
+{
+  // Step 1: Start the master.
+  Try<PID<Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  // Step 2: Start slave 1.
+  slave::Flags slaveFlags1 = CreateSlaveFlags();
+  MockExecutor exec1(DEFAULT_EXECUTOR_ID);
+
+  Try<PID<Slave>> slave1 = StartSlave(&exec1, slaveFlags1);
+  ASSERT_SOME(slave1);
+
+  // Step 3: Start the scheduler.
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .Times(1);
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillRepeatedly(FutureArg<1>(&offers));
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers.get().size());
+
+  // The task ID shared by both tasks; reusing it is the whole point
+  // of this test.
+  TaskID taskId;
+  taskId.set_value("1");
+
+  // Step 4: Start a long running task on slave 1 so that it is still
+  // present when the slave recovers (re-registers).
+  const Offer& offer1 = offers.get()[0];
+  TaskInfo task1 = createTask(offer1, "sleep 10000");
+  task1.mutable_task_id()->MergeFrom(taskId);
+
+  driver.launchTasks(offer1.id(), {task1});
+
+  // Step 5: Stop the master and slave 1 to trigger re-registration later.
+  this->Stop(master.get());
+  this->Stop(slave1.get());
+
+  // Step 6: Re-start the master.
+  master = StartMaster();
+  ASSERT_SOME(master);
+
+  // Step 7: Start slave 2.
+  // NOTE(review): `offers` was already satisfied by the first offer
+  // above. Confirm that discard() plus the repeated FutureArg action
+  // really produces a fresh future here; otherwise the AWAIT_READY
+  // below returns immediately with the stale offer from slave 1.
+  offers.discard();
+
+  slave::Flags slaveFlags2 = CreateSlaveFlags();
+  MockExecutor exec2(DEFAULT_EXECUTOR_ID);
+
+  Try<PID<Slave>> slave2 = StartSlave(&exec2, slaveFlags2);
+  ASSERT_SOME(slave2);
+
+  AWAIT_READY(offers);
+
+  const Offer& offer2 = offers.get()[0];
+
+  // Step 8: Submit a new task (with the same ID) to slave 2.
+  // The ID must be applied to `task2`; applying it to `task1` again
+  // would leave `task2` with whatever ID createTask() assigned and the
+  // duplicate-ID path would never be exercised.
+  TaskInfo task2 = createTask(offer2, "sleep 10000");
+  task2.mutable_task_id()->MergeFrom(taskId);
+
+  driver.launchTasks(offer2.id(), {task2});
+
+  // Step 9: Set up the expected outcome:
+  //   1. The task is killed in the executor.
+  //   2. The status is updated to the framework.
+  EXPECT_CALL(exec1, killTask(_, _))
+    .WillOnce(Return());
+
+  Future<TaskStatus> status;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status));
+
+  Future<SlaveReregisteredMessage> slaveReregisteredMessage =
+    FUTURE_PROTOBUF(SlaveReregisteredMessage(), master.get(), _);
+
+  // Step 10: Re-start slave 1 to trigger the duplicated task ID.
+  slave1 = StartSlave(&exec1, slaveFlags1);
+  ASSERT_SOME(slave1);
+
+  AWAIT_READY(slaveReregisteredMessage);
+
+  AWAIT_READY(status);
+  EXPECT_EQ(TASK_KILLED, status.get().state());
+
+  // Step 11: Stop the driver and tear down the test cluster.
+  driver.stop();
+  driver.join();
+
+  Shutdown();
+}
{code}

> Master CHECK failure if a framework uses duplicated task id.
> ------------------------------------------------------------
>
>                 Key: MESOS-3070
>                 URL: https://issues.apache.org/jira/browse/MESOS-3070
>             Project: Mesos
>          Issue Type: Bug
>          Components: master
>    Affects Versions: 0.22.1
>            Reporter: Jie Yu
>            Assignee: Klaus Ma
>
> We observed this in one of our testing clusters.
> One framework (under development) keeps launching tasks using the same 
> task_id. We don't expect the master to crash even if the framework is not 
> doing what it's supposed to do. However, under a series of events, this could 
> happen and keep crashing the master.
> 1) frameworkA launches task 'task_id_1' on slaveA
> 2) master fails over
> 3) slaveA has not re-registered yet
> 4) frameworkA re-registered and launches task 'task_id_1' on slaveB
> 5) slaveA re-registers and adds task 'task_id_1' to frameworkA
> 6) CHECK failure in addTask
> {noformat}
> I0716 21:52:50.759305 28805 master.hpp:159] Adding task 'task_id_1' with 
> resources cpus(*):4; mem(*):32768 on slave 
> 20150417-232509-1735470090-5050-48870-S25 (hostname)
> ...
> ...
> F0716 21:52:50.760136 28805 master.hpp:362] Check failed: 
> !tasks.contains(task->task_id()) Duplicate task 'task_id_1' of framework 
> <framework_id>
> {noformat}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to