Repository: aurora
Updated Branches:
  refs/heads/master 007212d80 -> 8cf3e3b84


Fix infinite loop in Task State Machine due to TASK_UNKNOWN handling

This patch cleans up the logic. The two main changes:

1) Do not allow ASSIGNED -> PARTITIONED. This is not really related to this 
bug, but I found this logic error during debugging. ASSIGNED is a transient 
state and is subject to the transient task timeout in the Scheduler, so we 
should not attempt to move to PARTITIONED during that window.
2) Do not try to kill tasks we think are terminal when Mesos tells us they are 
unknown. Originally we did this because "manageTerminalTasks" is also used for 
restarting tasks - but in both cases it never makes sense to respond  to "I 
don't know about that task" with a request to kill it.

Bugs closed: AURORA-1966

Reviewed at https://reviews.apache.org/r/65339/


Project: http://git-wip-us.apache.org/repos/asf/aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/8cf3e3b8
Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/8cf3e3b8
Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/8cf3e3b8

Branch: refs/heads/master
Commit: 8cf3e3b840463257c198d98697b04c5fd99b3d21
Parents: 007212d
Author: David McLaughlin <[email protected]>
Authored: Tue Jan 30 14:02:09 2018 -0800
Committer: David McLaughlin <[email protected]>
Committed: Tue Jan 30 14:02:09 2018 -0800

----------------------------------------------------------------------
 .../scheduler/state/TaskStateMachine.java       | 10 +++-------
 .../scheduler/state/TaskStateMachineTest.java   | 21 --------------------
 2 files changed, 3 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/aurora/blob/8cf3e3b8/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java 
b/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java
index b8ba5da..f325bf4 100644
--- a/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java
+++ b/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java
@@ -178,12 +178,8 @@ class TaskStateMachine {
               // order to mark the task as terminal and unblock any operations 
that depend on
               // kills (e.g. job updates). Just sending a KILL signal alone is 
insufficient,
               // as any partitioned task will be on an agent that is 
unreachable.
-              if (transition.getTo() == PARTITIONED) {
-                if (transition.getFrom() == KILLING) {
-                  addFollowup(TRANSITION_TO_LOST);
-                } else {
-                  addFollowup(KILL);
-                }
+              if (transition.getFrom() == KILLING && transition.getTo() == 
PARTITIONED) {
+                addFollowup(TRANSITION_TO_LOST);
               }
             })
             // Kill a task that we believe to be terminated when an attempt is 
made to revive.
@@ -312,7 +308,7 @@ class TaskStateMachine {
         .addState(
             Rule.from(ASSIGNED)
                 .to(STARTING, RUNNING, FINISHED, FAILED, RESTARTING, DRAINING,
-                    KILLED, KILLING, LOST, PREEMPTING, PARTITIONED)
+                    KILLED, KILLING, LOST, PREEMPTING)
                 .withCallback(
                     transition -> {
                       switch (transition.getTo()) {

http://git-wip-us.apache.org/repos/asf/aurora/blob/8cf3e3b8/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java 
b/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java
index 3d98fe6..050a46a 100644
--- a/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java
+++ b/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java
@@ -404,18 +404,15 @@ public class TaskStateMachineTest {
           .put(new TestCase(false, INIT, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, INIT, STARTING), ILLEGAL_KILL)
           .put(new TestCase(false, INIT, RUNNING), ILLEGAL_KILL)
-          .put(new TestCase(false, INIT, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, THROTTLED, PENDING), SAVE)
           .put(new TestCase(true, THROTTLED, KILLING), DELETE_TASK)
           .put(new TestCase(false, THROTTLED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, THROTTLED, STARTING), ILLEGAL_KILL)
           .put(new TestCase(false, THROTTLED, RUNNING), ILLEGAL_KILL)
-          .put(new TestCase(false, THROTTLED, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, PENDING, ASSIGNED), SAVE)
           .put(new TestCase(false, PENDING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, PENDING, STARTING), ILLEGAL_KILL)
           .put(new TestCase(false, PENDING, RUNNING), ILLEGAL_KILL)
-          .put(new TestCase(false, PENDING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, PENDING, KILLING), DELETE_TASK)
           .put(new TestCase(false, ASSIGNED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, ASSIGNED, STARTING), SAVE)
@@ -430,8 +427,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, ASSIGNED, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, ASSIGNED, KILLING), SAVE_AND_KILL)
           .put(new TestCase(true, ASSIGNED, LOST), SAVE_KILL_AND_RESCHEDULE)
-          .put(new TestCase(false, ASSIGNED, PARTITIONED), ILLEGAL_KILL)
-          .put(new TestCase(true, ASSIGNED, PARTITIONED), SAVE)
           .put(new TestCase(false, STARTING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, STARTING, STARTING), ILLEGAL_KILL)
           .put(new TestCase(true, STARTING, RUNNING), SAVE)
@@ -444,7 +439,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, STARTING, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, STARTING, KILLING), SAVE_AND_KILL)
           .put(new TestCase(true, STARTING, LOST), SAVE_AND_RESCHEDULE)
-          .put(new TestCase(false, STARTING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, STARTING, PARTITIONED), SAVE)
           .put(new TestCase(false, RUNNING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, RUNNING, STARTING), ILLEGAL_KILL)
@@ -457,7 +451,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, RUNNING, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, RUNNING, KILLING), SAVE_AND_KILL)
           .put(new TestCase(true, RUNNING, LOST), SAVE_AND_RESCHEDULE)
-          .put(new TestCase(false, RUNNING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, RUNNING, PARTITIONED), SAVE)
           .put(new TestCase(true, FINISHED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, FINISHED, ASSIGNED), ILLEGAL_KILL)
@@ -466,8 +459,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, FINISHED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(false, FINISHED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(true, FINISHED, DELETED), DELETE_TASK)
-          .put(new TestCase(false, FINISHED, PARTITIONED), ILLEGAL_KILL)
-          .put(new TestCase(true, FINISHED, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, PREEMPTING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, PREEMPTING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, PREEMPTING, STARTING), ILLEGAL_KILL)
@@ -479,7 +470,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, PREEMPTING, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, PREEMPTING, KILLING), SAVE)
           .put(new TestCase(true, PREEMPTING, LOST), SAVE_KILL_AND_RESCHEDULE)
-          .put(new TestCase(false, PREEMPTING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, PREEMPTING, PARTITIONED), TRANSITION_TO_LOST)
           .put(new TestCase(true, RESTARTING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, RESTARTING, ASSIGNED), ILLEGAL_KILL)
@@ -492,7 +482,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, RESTARTING, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, RESTARTING, KILLING), SAVE)
           .put(new TestCase(true, RESTARTING, LOST), SAVE_KILL_AND_RESCHEDULE)
-          .put(new TestCase(false, RESTARTING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, RESTARTING, PARTITIONED), TRANSITION_TO_LOST)
           .put(new TestCase(true, DRAINING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, DRAINING, ASSIGNED), ILLEGAL_KILL)
@@ -505,7 +494,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, DRAINING, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, DRAINING, KILLING), SAVE)
           .put(new TestCase(true, DRAINING, LOST), SAVE_KILL_AND_RESCHEDULE)
-          .put(new TestCase(false, DRAINING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, DRAINING, PARTITIONED), TRANSITION_TO_LOST)
           .put(new TestCase(true, FAILED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, FAILED, ASSIGNED), ILLEGAL_KILL)
@@ -514,8 +502,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, FAILED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(false, FAILED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(true, FAILED, DELETED), DELETE_TASK)
-          .put(new TestCase(false, FAILED, PARTITIONED), ILLEGAL_KILL)
-          .put(new TestCase(true, FAILED, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, KILLED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, KILLED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, KILLED, STARTING), ILLEGAL_KILL)
@@ -523,8 +509,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, KILLED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(false, KILLED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(true, KILLED, DELETED), DELETE_TASK)
-          .put(new TestCase(false, KILLED, PARTITIONED), ILLEGAL_KILL)
-          .put(new TestCase(true, KILLED, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, KILLING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, KILLING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, KILLING, STARTING), ILLEGAL_KILL)
@@ -536,7 +520,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, KILLING, KILLED), SAVE)
           .put(new TestCase(true, KILLING, LOST), SAVE)
           .put(new TestCase(true, KILLING, DELETED), DELETE_TASK)
-          .put(new TestCase(false, KILLING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, KILLING, PARTITIONED), TRANSITION_TO_LOST)
           .put(new TestCase(false, PARTITIONED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, PARTITIONED, ASSIGNED), SAVE)
@@ -551,7 +534,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, PARTITIONED, FAILED), RECORD_FAILURE)
           .put(new TestCase(true, PARTITIONED, FINISHED), SAVE)
           .put(new TestCase(true, PARTITIONED, LOST), SAVE_KILL_AND_RESCHEDULE)
-          .put(new TestCase(false, PARTITIONED, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, LOST, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, LOST, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, LOST, STARTING), ILLEGAL_KILL)
@@ -559,12 +541,9 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, LOST, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(false, LOST, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(true, LOST, DELETED), DELETE_TASK)
-          .put(new TestCase(false, LOST, PARTITIONED), ILLEGAL_KILL)
-          .put(new TestCase(true, LOST, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(false, DELETED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, DELETED, STARTING), ILLEGAL_KILL)
           .put(new TestCase(false, DELETED, RUNNING), ILLEGAL_KILL)
-          .put(new TestCase(false, DELETED, PARTITIONED), ILLEGAL_KILL)
           .build();
 
   @Test

Reply via email to