Repository: aurora Updated Branches: refs/heads/master 007212d80 -> 8cf3e3b84
Fix infinite loop in Task State Machine due to TASK_UNKNOWN handling This patch cleans up the logic. The two main changes: 1) Do not allow ASSIGNED -> PARTITIONED. This is not really related to this bug, but I found this logic error during debugging. ASSIGNED is a transient state and is subject to the transient task timeout in the Scheduler, so we should not attempt to move to PARTITIONED during that window. 2) Do not try to kill tasks we think are terminal when Mesos tells us they are unknown. Originally we did this because "manageTerminalTasks" is also used for restarting tasks - but in both cases it never makes sense to respond to "I don't know about that task" with a request to kill it. Bugs closed: AURORA-1966 Reviewed at https://reviews.apache.org/r/65339/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/8cf3e3b8 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/8cf3e3b8 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/8cf3e3b8 Branch: refs/heads/master Commit: 8cf3e3b840463257c198d98697b04c5fd99b3d21 Parents: 007212d Author: David McLaughlin <[email protected]> Authored: Tue Jan 30 14:02:09 2018 -0800 Committer: David McLaughlin <[email protected]> Committed: Tue Jan 30 14:02:09 2018 -0800 ---------------------------------------------------------------------- .../scheduler/state/TaskStateMachine.java | 10 +++------- .../scheduler/state/TaskStateMachineTest.java | 21 -------------------- 2 files changed, 3 insertions(+), 28 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/8cf3e3b8/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java b/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java index b8ba5da..f325bf4 100644 --- a/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java +++ b/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java @@ -178,12 +178,8 @@ class TaskStateMachine { // order to mark the task as terminal and unblock any operations that depend on // kills (e.g. job updates). Just sending a KILL signal alone is insufficient, // as any partitioned task will be on an agent that is unreachable. - if (transition.getTo() == PARTITIONED) { - if (transition.getFrom() == KILLING) { - addFollowup(TRANSITION_TO_LOST); - } else { - addFollowup(KILL); - } + if (transition.getFrom() == KILLING && transition.getTo() == PARTITIONED) { + addFollowup(TRANSITION_TO_LOST); } }) // Kill a task that we believe to be terminated when an attempt is made to revive. @@ -312,7 +308,7 @@ class TaskStateMachine { .addState( Rule.from(ASSIGNED) .to(STARTING, RUNNING, FINISHED, FAILED, RESTARTING, DRAINING, - KILLED, KILLING, LOST, PREEMPTING, PARTITIONED) + KILLED, KILLING, LOST, PREEMPTING) .withCallback( transition -> { switch (transition.getTo()) { http://git-wip-us.apache.org/repos/asf/aurora/blob/8cf3e3b8/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java b/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java index 3d98fe6..050a46a 100644 --- a/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java +++ b/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java @@ -404,18 +404,15 @@ public class TaskStateMachineTest { .put(new TestCase(false, INIT, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, INIT, STARTING), ILLEGAL_KILL) .put(new TestCase(false, INIT, RUNNING), ILLEGAL_KILL) - .put(new TestCase(false, INIT, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, THROTTLED, PENDING), SAVE) .put(new TestCase(true, THROTTLED, KILLING), DELETE_TASK) .put(new TestCase(false, THROTTLED, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, THROTTLED, STARTING), ILLEGAL_KILL) .put(new TestCase(false, THROTTLED, RUNNING), ILLEGAL_KILL) - .put(new TestCase(false, THROTTLED, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, PENDING, ASSIGNED), SAVE) .put(new TestCase(false, PENDING, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, PENDING, STARTING), ILLEGAL_KILL) .put(new TestCase(false, PENDING, RUNNING), ILLEGAL_KILL) - .put(new TestCase(false, PENDING, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, PENDING, KILLING), DELETE_TASK) .put(new TestCase(false, ASSIGNED, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(true, ASSIGNED, STARTING), SAVE) @@ -430,8 +427,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, ASSIGNED, KILLED), SAVE_AND_RESCHEDULE) .put(new TestCase(true, ASSIGNED, KILLING), SAVE_AND_KILL) .put(new TestCase(true, ASSIGNED, LOST), SAVE_KILL_AND_RESCHEDULE) - .put(new TestCase(false, ASSIGNED, PARTITIONED), ILLEGAL_KILL) - .put(new TestCase(true, ASSIGNED, PARTITIONED), SAVE) .put(new TestCase(false, STARTING, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, STARTING, STARTING), ILLEGAL_KILL) .put(new TestCase(true, STARTING, RUNNING), SAVE) @@ -444,7 +439,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, STARTING, KILLED), SAVE_AND_RESCHEDULE) .put(new TestCase(true, STARTING, KILLING), SAVE_AND_KILL) .put(new TestCase(true, STARTING, LOST), SAVE_AND_RESCHEDULE) - .put(new TestCase(false, STARTING, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, STARTING, PARTITIONED), SAVE) .put(new TestCase(false, RUNNING, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, RUNNING, STARTING), ILLEGAL_KILL) @@ -457,7 +451,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, RUNNING, KILLED), SAVE_AND_RESCHEDULE) .put(new TestCase(true, RUNNING, KILLING), SAVE_AND_KILL) .put(new TestCase(true, RUNNING, LOST), SAVE_AND_RESCHEDULE) - .put(new TestCase(false, RUNNING, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, RUNNING, PARTITIONED), SAVE) .put(new TestCase(true, FINISHED, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, FINISHED, ASSIGNED), ILLEGAL_KILL) @@ -466,8 +459,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, FINISHED, RUNNING), ILLEGAL_KILL) .put(new TestCase(false, FINISHED, RUNNING), ILLEGAL_KILL) .put(new TestCase(true, FINISHED, DELETED), DELETE_TASK) - .put(new TestCase(false, FINISHED, PARTITIONED), ILLEGAL_KILL) - .put(new TestCase(true, FINISHED, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, PREEMPTING, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, PREEMPTING, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(true, PREEMPTING, STARTING), ILLEGAL_KILL) @@ -479,7 +470,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, PREEMPTING, KILLED), SAVE_AND_RESCHEDULE) .put(new TestCase(true, PREEMPTING, KILLING), SAVE) .put(new TestCase(true, PREEMPTING, LOST), SAVE_KILL_AND_RESCHEDULE) - .put(new TestCase(false, PREEMPTING, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, PREEMPTING, PARTITIONED), TRANSITION_TO_LOST) .put(new TestCase(true, RESTARTING, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, RESTARTING, ASSIGNED), ILLEGAL_KILL) @@ -492,7 +482,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, RESTARTING, KILLED), SAVE_AND_RESCHEDULE) .put(new TestCase(true, RESTARTING, KILLING), SAVE) .put(new TestCase(true, RESTARTING, LOST), SAVE_KILL_AND_RESCHEDULE) - .put(new TestCase(false, RESTARTING, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, RESTARTING, PARTITIONED), TRANSITION_TO_LOST) .put(new TestCase(true, DRAINING, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, DRAINING, ASSIGNED), ILLEGAL_KILL) @@ -505,7 +494,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, DRAINING, KILLED), SAVE_AND_RESCHEDULE) .put(new TestCase(true, DRAINING, KILLING), SAVE) .put(new TestCase(true, DRAINING, LOST), SAVE_KILL_AND_RESCHEDULE) - .put(new TestCase(false, DRAINING, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, DRAINING, PARTITIONED), TRANSITION_TO_LOST) .put(new TestCase(true, FAILED, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, FAILED, ASSIGNED), ILLEGAL_KILL) @@ -514,8 +502,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, FAILED, RUNNING), ILLEGAL_KILL) .put(new TestCase(false, FAILED, RUNNING), ILLEGAL_KILL) .put(new TestCase(true, FAILED, DELETED), DELETE_TASK) - .put(new TestCase(false, FAILED, PARTITIONED), ILLEGAL_KILL) - .put(new TestCase(true, FAILED, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, KILLED, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, KILLED, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(true, KILLED, STARTING), ILLEGAL_KILL) @@ -523,8 +509,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, KILLED, RUNNING), ILLEGAL_KILL) .put(new TestCase(false, KILLED, RUNNING), ILLEGAL_KILL) .put(new TestCase(true, KILLED, DELETED), DELETE_TASK) - .put(new TestCase(false, KILLED, PARTITIONED), ILLEGAL_KILL) - .put(new TestCase(true, KILLED, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, KILLING, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, KILLING, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(true, KILLING, STARTING), ILLEGAL_KILL) @@ -536,7 +520,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, KILLING, KILLED), SAVE) .put(new TestCase(true, KILLING, LOST), SAVE) .put(new TestCase(true, KILLING, DELETED), DELETE_TASK) - .put(new TestCase(false, KILLING, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, KILLING, PARTITIONED), TRANSITION_TO_LOST) .put(new TestCase(false, PARTITIONED, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(true, PARTITIONED, ASSIGNED), SAVE) @@ -551,7 +534,6 @@ public class TaskStateMachineTest { .put(new TestCase(true, PARTITIONED, FAILED), RECORD_FAILURE) .put(new TestCase(true, PARTITIONED, FINISHED), SAVE) .put(new TestCase(true, PARTITIONED, LOST), SAVE_KILL_AND_RESCHEDULE) - .put(new TestCase(false, PARTITIONED, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(true, LOST, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, LOST, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(true, LOST, STARTING), ILLEGAL_KILL) @@ -559,12 +541,9 @@ public class TaskStateMachineTest { .put(new TestCase(true, LOST, RUNNING), ILLEGAL_KILL) .put(new TestCase(false, LOST, RUNNING), ILLEGAL_KILL) .put(new TestCase(true, LOST, DELETED), DELETE_TASK) - .put(new TestCase(false, LOST, PARTITIONED), ILLEGAL_KILL) - .put(new TestCase(true, LOST, PARTITIONED), ILLEGAL_KILL) .put(new TestCase(false, DELETED, ASSIGNED), ILLEGAL_KILL) .put(new TestCase(false, DELETED, STARTING), ILLEGAL_KILL) .put(new TestCase(false, DELETED, RUNNING), ILLEGAL_KILL) - .put(new TestCase(false, DELETED, PARTITIONED), ILLEGAL_KILL) .build(); @Test
