AMBARI-20269. Failed task during EU is not reported upfront causing Upgrade to show 'Aborted' after Finalize step. (mpapirkovskyy)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/0e4819ae Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/0e4819ae Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/0e4819ae Branch: refs/heads/branch-feature-AMBARI-12556 Commit: 0e4819ae7b45fc659067b83d1cba0f97435528bd Parents: aebe216 Author: Myroslav Papirkovskyi <[email protected]> Authored: Thu Mar 2 21:12:05 2017 +0200 Committer: Myroslav Papirkovskyi <[email protected]> Committed: Fri Mar 3 11:45:25 2017 +0200 ---------------------------------------------------------------------- .../server/actionmanager/ActionDBAccessor.java | 4 +- .../actionmanager/ActionDBAccessorImpl.java | 11 +++-- .../server/actionmanager/ActionScheduler.java | 30 ++---------- .../actionmanager/TestActionScheduler.java | 51 +++++++++----------- 4 files changed, 37 insertions(+), 59 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/0e4819ae/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessor.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessor.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessor.java index 217fe0a..9325d03 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessor.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessor.java @@ -72,8 +72,8 @@ public interface ActionDBAccessor { /** * Mark the task as to have timed out */ - void timeoutHostRole(String host, long requestId, long stageId, - String role, boolean skipSupported); + void timeoutHostRole(String host, long requestId, long stageId, String role, + boolean skipSupported, boolean hostUnknownState); /** * Returns all the pending stages, including queued and not-queued. A stage is http://git-wip-us.apache.org/repos/asf/ambari/blob/0e4819ae/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java index b813fe6..14f02d2 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionDBAccessorImpl.java @@ -241,20 +241,21 @@ public class ActionDBAccessorImpl implements ActionDBAccessor { @Override public void timeoutHostRole(String host, long requestId, long stageId, String role) { - timeoutHostRole(host, requestId, stageId, role, false); + timeoutHostRole(host, requestId, stageId, role, false, false); } @Override - public void timeoutHostRole(String host, long requestId, long stageId, - String role, boolean skipSupported) { + public void timeoutHostRole(String host, long requestId, long stageId, String role, + boolean skipSupported, boolean hostUnknownState) { long now = System.currentTimeMillis(); List<HostRoleCommandEntity> commands = - hostRoleCommandDAO.findByHostRole(host, requestId, stageId, role); + hostRoleCommandDAO.findByHostRole(host, requestId, stageId, role); for (HostRoleCommandEntity command : commands) { if (skipSupported) { command.setStatus(HostRoleStatus.SKIPPED_FAILED); } else { - command.setStatus(command.isRetryAllowed() ? HostRoleStatus.HOLDING_TIMEDOUT : HostRoleStatus.TIMEDOUT); + command.setStatus(command.isRetryAllowed() ? HostRoleStatus.HOLDING_TIMEDOUT : + hostUnknownState ? HostRoleStatus.ABORTED : HostRoleStatus.TIMEDOUT); } command.setEndTime(now); http://git-wip-us.apache.org/repos/asf/ambari/blob/0e4819ae/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionScheduler.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionScheduler.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionScheduler.java index a92c03c..c8e4532 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionScheduler.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionScheduler.java @@ -817,6 +817,7 @@ class ActionScheduler implements Runnable { } // Check that service host component is not deleted + boolean isHostStateUnknown = false; if (hostDeleted) { String message = String.format( @@ -834,9 +835,10 @@ class ActionScheduler implements Runnable { processActionDeath(cluster.getClusterName(), c.getHostname(), roleStr); } status = HostRoleStatus.ABORTED; - } else if (timeOutActionNeeded(status, s, hostObj, roleStr, now, commandTimeout)) { + } else if (timeOutActionNeeded(status, s, hostObj, roleStr, now, commandTimeout) + || (isHostStateUnknown = isHostStateUnknown(s, hostObj, roleStr))) { // Process command timeouts - if (s.getAttemptCount(host, roleStr) >= maxAttempts) { + if (s.getAttemptCount(host, roleStr) >= maxAttempts || isHostStateUnknown) { LOG.warn("Host: {}, role: {}, actionId: {} expired and will be failed", host, roleStr, s.getActionId()); @@ -847,7 +849,7 @@ class ActionScheduler implements Runnable { isSkipSupported = hostRoleCommand.isFailureAutoSkipped(); } - db.timeoutHostRole(host, s.getRequestId(), s.getStageId(), c.getRole(), isSkipSupported); + db.timeoutHostRole(host, s.getRequestId(), s.getStageId(), c.getRole(), isSkipSupported, isHostStateUnknown); //Reinitialize status status = s.getHostRoleStatus(host, roleStr); @@ -876,28 +878,6 @@ class ActionScheduler implements Runnable { commandsToSchedule.add(c); LOG.trace("===> commandsToSchedule(reschedule)=" + commandsToSchedule.size()); } - } else if (isHostStateUnknown(s, hostObj, roleStr)) { - String message = "Action was aborted due agent is not heartbeating or was restarted."; - LOG.warn("Host: {}, role: {}, actionId: {} . {}", host, roleStr, - s.getActionId(), message); - - db.abortHostRole(host, s.getRequestId(), s.getStageId(), c.getRole(), message); - - if (null != cluster) { - if (!RoleCommand.CUSTOM_COMMAND.equals(c.getRoleCommand()) - && !RoleCommand.SERVICE_CHECK.equals(c.getRoleCommand()) - && !RoleCommand.ACTIONEXECUTE.equals(c.getRoleCommand())) { - //commands above don't affect host component state (e.g. no in_progress state in process), transition will fail - transitionToFailedState(cluster.getClusterName(), c.getServiceName(), roleStr, host, now, false); - } - if (c.getRoleCommand().equals(RoleCommand.ACTIONEXECUTE)) { - processActionDeath(cluster.getClusterName(), c.getHostname(), roleStr); - } - } - - // Dequeue command - LOG.info("Removing command from queue, host={}, commandId={} ", host, c.getCommandId()); - actionQueue.dequeue(host, c.getCommandId()); } else if (status.equals(HostRoleStatus.PENDING)) { // in case of DEPENDENCY_ORDERED stage command can be scheduled only if all of it's dependencies are // already finished http://git-wip-us.apache.org/repos/asf/ambari/blob/0e4819ae/ambari-server/src/test/java/org/apache/ambari/server/actionmanager/TestActionScheduler.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/java/org/apache/ambari/server/actionmanager/TestActionScheduler.java b/ambari-server/src/test/java/org/apache/ambari/server/actionmanager/TestActionScheduler.java index 526ca7c..718c7ce 100644 --- a/ambari-server/src/test/java/org/apache/ambari/server/actionmanager/TestActionScheduler.java +++ b/ambari-server/src/test/java/org/apache/ambari/server/actionmanager/TestActionScheduler.java @@ -335,7 +335,7 @@ public class TestActionScheduler { command.setStatus(HostRoleStatus.TIMEDOUT); return null; } - }).when(db).timeoutHostRole(anyString(), anyLong(), anyLong(), anyString(), anyBoolean()); + }).when(db).timeoutHostRole(anyString(), anyLong(), anyLong(), anyString(), anyBoolean(), eq(false)); //Small action timeout to test rescheduling @@ -416,32 +416,18 @@ public class TestActionScheduler { doAnswer(new Answer<Void>() { @Override public Void answer(InvocationOnMock invocation) throws Throwable { - Long requestId = (Long) invocation.getArguments()[1]; - for (Stage stage : stages) { - if (requestId.equals(stage.getRequestId())) { - for (HostRoleCommand command : stage.getOrderedHostRoleCommands()) { - if (command.getStatus() == HostRoleStatus.QUEUED || - command.getStatus() == HostRoleStatus.IN_PROGRESS || - command.getStatus() == HostRoleStatus.PENDING) { - command.setStatus(HostRoleStatus.ABORTED); - } - } - } - } - + String host = (String) invocation.getArguments()[0]; + String role = (String) invocation.getArguments()[3]; + HostRoleCommand command = s.getHostRoleCommand(host, role); + command.setStatus(HostRoleStatus.ABORTED); return null; } - }).when(db).abortHostRole(anyString(), anyLong(), anyLong(), anyString(), anyString()); + }).when(db).timeoutHostRole(anyString(), anyLong(), anyLong(), anyString(), anyBoolean(), eq(true)); //Small action timeout to test rescheduling AmbariEventPublisher aep = EasyMock.createNiceMock(AmbariEventPublisher.class); - ActionScheduler scheduler = EasyMock.createMockBuilder(ActionScheduler.class). - withConstructor((long) 100, (long) 50, db, aq, fsm, 3, - new HostsMap((String) null), unitOfWork, aep, conf, entityManagerProviderMock, - mock(HostRoleCommandDAO.class), mock(HostRoleCommandFactory.class)). - addMockedMethod("cancelHostRoleCommands"). - createMock(); - EasyMock.replay(scheduler); + ActionScheduler scheduler = new ActionScheduler(100, 0, db, aq, fsm, 3, + new HostsMap((String) null), unitOfWork, null, conf, entityManagerProviderMock, hostRoleCommandDAOMock, null); scheduler.setTaskTimeoutAdjustment(false); int cycleCount=0; @@ -452,7 +438,7 @@ public class TestActionScheduler { Assert.assertEquals(HostRoleStatus.ABORTED,stages.get(0).getHostRoleStatus(hostname, "NAMENODE")); - EasyMock.verify(scheduler, entityManagerProviderMock); + EasyMock.verify(entityManagerProviderMock); } @Test @@ -499,7 +485,7 @@ public class TestActionScheduler { when(serviceObj.getCluster()).thenReturn(oneClusterMock); final List<Stage> stages = new ArrayList<Stage>(); - Stage stage = stageFactory.createNew(1, "/tmp", "cluster1", 1L, "stageWith2Tasks", + final Stage stage = stageFactory.createNew(1, "/tmp", "cluster1", 1L, "stageWith2Tasks", CLUSTER_HOST_INFO, "{\"command_param\":\"param_value\"}", "{\"host_param\":\"param_value\"}"); addInstallTaskToStage(stage, hostname1, "cluster1", Role.DATANODE, RoleCommand.INSTALL, Service.Type.HDFS, 1); @@ -518,10 +504,21 @@ public class TestActionScheduler { HostRoleCommandDAO hostRoleCommandDAOMock = mock(HostRoleCommandDAO.class); Mockito.doNothing().when(hostRoleCommandDAOMock).publishTaskCreateEvent(anyListOf(HostRoleCommand.class)); + doAnswer(new Answer<Void>() { + @Override + public Void answer(InvocationOnMock invocation) throws Throwable { + String host = (String) invocation.getArguments()[0]; + String role = (String) invocation.getArguments()[3]; + HostRoleCommand command = stage.getHostRoleCommand(host, role); + command.setStatus(HostRoleStatus.ABORTED); + return null; + } + }).when(db).timeoutHostRole(anyString(), anyLong(), anyLong(), anyString(), anyBoolean(), eq(true)); + doAnswer(new Answer<Collection<HostRoleCommandEntity>>() { @Override public Collection<HostRoleCommandEntity> answer(InvocationOnMock invocation) throws Throwable { - Long requestId = (Long) invocation.getArguments()[1]; + Long requestId = (Long) invocation.getArguments()[0]; List<HostRoleCommandEntity> abortedCommands = Lists.newArrayList(); for (Stage stage : stages) { if (requestId.equals(stage.getRequestId())) { @@ -542,7 +539,7 @@ public class TestActionScheduler { return abortedCommands; } - }).when(db).abortHostRole(anyString(), anyLong(), anyLong(), anyString(), anyString()); + }).when(db).abortOperation(anyLong()); ArgumentCaptor<ServiceComponentHostEvent> eventsCapture1 = ArgumentCaptor.forClass(ServiceComponentHostEvent.class); @@ -954,7 +951,7 @@ public class TestActionScheduler { boolean taskShouldBeSkipped = stageSupportsAutoSkip && autoSkipFailedTask; db.timeoutHostRole(EasyMock.anyString(), EasyMock.anyLong(), EasyMock.anyLong(), - EasyMock.anyString(), EasyMock.eq(taskShouldBeSkipped)); + EasyMock.anyString(), EasyMock.eq(taskShouldBeSkipped), EasyMock.anyBoolean()); EasyMock.expectLastCall();
