[mesos] 01/03: Changed Agent V1 GET_STATE for any completed executor's tasks.

josephwu Tue, 28 May 2019 15:47:52 -0700

This is an automated email from the ASF dual-hosted git repository.

josephwu pushed a commit to branch 1.8.x
in repository https://gitbox.apache.org/repos/asf/mesos.git


commit c7a9b418a159d3efda88ac6d26da2b305c490749
Author: Joseph Wu <[email protected]>
AuthorDate: Tue Apr 30 19:06:26 2019 -0700

    Changed Agent V1 GET_STATE for any completed executor's tasks.
    
    It is possible for a completed executor to have a non-terminal task
    (based on last status update).  For example, during graceful shutdown
    of an agent, graceful shutdown of an executor will race with the
    agent's shutdown grace period.  If the executor does not send a
    TASK_KILLED in time, the agent will still mark the executor as complete
    and kill it.
    
    After agent recovery, these completed executors will show up in an
    agent's /state and GET_STATE responses.  In GET_STATE however, any
    non-terminal tasks will appear under `launched_tasks`.  This may
    provide misleading information about the total number of tasks running.
    
    This commit adds extra logic to place these non-terminal tasks under
    the `terminated_tasks` category, and adds a regression test.
    
    Review: https://reviews.apache.org/r/70577
---
 src/slave/slave.cpp     |  12 +++++
 src/tests/api_tests.cpp | 123 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+)

diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 95f05a1..e966f7c 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -10024,6 +10024,18 @@ void Framework::recoverExecutor(
           executor->containerId,
           defaultExecutorTasks));
 
+    // Make sure there are no "launched tasks" on a recovered completed
+    // executor. We can only encounter these non-terminal terminated tasks
+    // when recovering a checkpointed executor that is missing a terminal
+    // status update. See MESOS-9750 for a one way to enter this state.
+    foreachpair (
+        const TaskID& taskId,
+        Task* task,
+        utils::copy(executor->launchedTasks)) {
+      executor->launchedTasks.erase(taskId);
+      executor->terminatedTasks[taskId] = task;
+    }
+
     // GC the executor run's meta directory.
     slave->garbageCollect(paths::getExecutorRunPath(
         slave->metaDir, slave->info.id(), id(), state.id, runId));
diff --git a/src/tests/api_tests.cpp b/src/tests/api_tests.cpp
index e76417a..e21681d 100644
--- a/src/tests/api_tests.cpp
+++ b/src/tests/api_tests.cpp
@@ -6451,6 +6451,129 @@ TEST_P_TEMP_DISABLED_ON_WINDOWS(AgentAPITest, GetState)
 }
 
 
+// Checks that the V1 GET_STATE API will correctly categorize a non-terminal
+// task as a completed task, if the task belongs to a completed executor.
+TEST_P_TEMP_DISABLED_ON_WINDOWS(
+    AgentAPITest, GetStateWithNonTerminalCompletedTask)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  MockExecutor exec(DEFAULT_EXECUTOR_ID);
+  TestContainerizer containerizer(&exec);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  slave::Flags slaveFlags = CreateSlaveFlags();
+
+  // Remove this delay so that the agent will immediately kill any tasks.
+  slaveFlags.executor_shutdown_grace_period = Seconds(0);
+
+  Try<Owned<cluster::Slave>> slave =
+    StartSlave(detector.get(), &containerizer, slaveFlags);
+  ASSERT_SOME(slave);
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_checkpoint(true);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  Future<FrameworkID> frameworkId;
+  EXPECT_CALL(sched, registered(_, _, _))
+    .WillOnce(FutureArg<1>(&frameworkId));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  Future<TaskStatus> statusRunning;
+  Future<TaskStatus> statusLost;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&statusRunning))
+    .WillOnce(FutureArg<1>(&statusLost));
+
+  EXPECT_CALL(sched, slaveLost(&driver, _))
+    .Times(AtMost(1));
+
+  EXPECT_CALL(exec, registered(_, _, _, _));
+
+  EXPECT_CALL(exec, launchTask(_, _))
+    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->empty());
+  const Offer& offer = offers.get()[0];
+
+  TaskInfo task = createTask(offer, "", DEFAULT_EXECUTOR_ID);
+
+  driver.launchTasks(offer.id(), {task});
+
+  AWAIT_READY(statusRunning);
+  EXPECT_EQ(TASK_RUNNING, statusRunning->state());
+
+  // Emulate the checkpointed state of an executor where the following occurs:
+  //   1) A graceful shutdown is initiated on the agent (i.e. SIGUSR1).
+  //   2) The executor is sent a kill, and starts killing its tasks.
+  //   3) The executor exits, before all terminal status updates reach the
+  //      agent. This results in a completed executor, with non-terminal tasks.
+  //
+  // A simple way to reach this state is to shutdown the agent and prevent
+  // the executor from sending the appropriate terminal status update.
+  Future<Nothing> shutdown;
+  EXPECT_CALL(exec, shutdown(_))
+    .WillOnce(FutureSatisfy(&shutdown));
+
+  slave.get()->shutdown();
+
+  AWAIT_READY(shutdown);
+  AWAIT_READY(statusLost);
+
+  // Start the agent back up, and allow it to recover the "completed" executor.
+  Future<Nothing> __recover = FUTURE_DISPATCH(_, &Slave::__recover);
+
+  slave = StartSlave(detector.get(), slaveFlags);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(__recover);
+
+  driver.stop();
+  driver.join();
+
+  ContentType contentType = GetParam();
+
+  // Non-terminal tasks on completed executors should appear as terminated
+  // tasks, even if they do not have a terminal status update.
+  {
+    v1::agent::Call v1Call;
+    v1Call.set_type(v1::agent::Call::GET_STATE);
+
+    Future<v1::agent::Response> v1Response =
+      post(slave.get()->pid, v1Call, contentType);
+
+    AWAIT_READY(v1Response);
+    ASSERT_TRUE(v1Response->IsInitialized());
+    ASSERT_EQ(v1::agent::Response::GET_STATE, v1Response->type());
+
+    const v1::agent::Response::GetState& getState = v1Response->get_state();
+    EXPECT_TRUE(getState.get_frameworks().frameworks().empty());
+    EXPECT_EQ(1, getState.get_frameworks().completed_frameworks_size());
+    EXPECT_TRUE(getState.get_tasks().launched_tasks().empty());
+    ASSERT_EQ(1, getState.get_tasks().terminated_tasks_size());
+    EXPECT_TRUE(getState.get_executors().executors().empty());
+    EXPECT_EQ(1, getState.get_executors().completed_executors_size());
+
+    // The latest state of this terminated task will not be terminal,
+    // because the executor was not given the chance to send the update.
+    EXPECT_EQ(
+        v1::TASK_RUNNING, getState.get_tasks().terminated_tasks(0).state());
+  }
+}
+
+
 // This test verifies that launch nested container session fails when
 // attaching to the output of the container fails. Consequently, the
 // launched container should be destroyed.

[mesos] 01/03: Changed Agent V1 GET_STATE for any completed executor's tasks.

Reply via email to